# Example
""" A [GFF parser script][1] in Python for [www.VigiLab.org][2]
Description:
- Performs buffered reading and filtering (see: @filter) of a .GFF input file (e.g. "[./toy.gff][3]"), keeping only rows whose type field (column 3) equals "transcript".
Args:
- None (yet)
Returns:
- None (yet)
Related:
- [1]: https://github.com/a1ultima/vigilab_intergeneShareGFF/blob/master/README.md
- [2]: http://www.vigilab.org/
- [3]: https://github.com/a1ultima/vigilab_intergeneShareGFF/blob/master/toy.gff
"""
# gene_to_field maps a 1-based row index (1..n, in file order, one entry per
# kept "transcript" row) to a dict holding that row's 9 tab-separated GFF
# columns. The GFF version of the input is unknown; example input:
# https://github.com/a1ultima/vigilab_intergeneShareGFF/blob/master/toy.gff
gene_to_field = {}
gene_i = 0  # number of rows stored so far; doubles as the key of the newest entry

with open("./toy.gff", "r") as fi:
    print("Reading GFF file into: gene_to_field (dict), index as such: gene_to_field[gene_i], where gene_i is between 1-to-n...")
    # Iterating the file object reads it lazily (buffered), one line at a
    # time, and stops only at the real end of file. Comparing a stripped
    # line with "" would instead end the parse at the first blank line.
    for line in fi:
        # Strip only the newline here: a bare rstrip() would also eat trailing
        # tabs and silently drop empty trailing columns.
        line = line.rstrip("\n")
        if not line.strip() or line.startswith("#"):
            continue  # blank lines and "#" comment/directive lines carry no feature row
        line_split = line.split("\t")  # one list element per field, e.g. [..., "transcript", ...]
        # Trailing whitespace (e.g. a stray "\r") is trimmed from the last field only.
        line_split[-1] = line_split[-1].rstrip()
        # @filter: keep only rows whose type column (3rd field) is "transcript".
        # Lines with fewer than 3 fields cannot be feature rows, so they are
        # skipped rather than raising IndexError.
        if len(line_split) < 3 or line_split[2] != "transcript":
            continue
        gene_i += 1  # indexing starts from 1 (i.e. [1] = first kept row) and ends at n
        # NOTE: a score written as "4.00" is stored as the float 4.0 (trivial difference).
        # "." is the GFF placeholder for an undefined value; float(".") would
        # raise ValueError, so an undefined score is stored as None.
        score_text = line_split[5]
        # A "transcript" row with fewer than 9 columns still raises IndexError
        # below, so malformed rows are not silently accepted.
        gene_to_field[gene_i] = {
            "c1_reference_seq": line_split[0],  # e.g. 'scaffold_150'
            "c2_source": line_split[1],  # e.g. 'GWSUNI'
            "c3_type": line_split[2],  # e.g. 'transcript'
            "c4_start": int(line_split[3]),  # e.g. '1372'
            "c5_end": int(line_split[4]),  # e.g. '2031'
            "c6_score": None if score_text == "." else float(score_text),  # e.g. '45.89'
            "c7_strand": line_split[6],  # e.g. '+'
            "c8_phase": line_split[7],  # e.g. '.' @Note: codon frame (0,1,2)
            "c9_attributes": line_split[8],  # e.g. <see @gff3.md>
        }