Exporting BioCantor data models
BioCantor data models can be exported to any of:
GenBank
GFF3
JSON
BED (TranscriptInterval and FeatureInterval only).
The JSON representation can be read directly by the marshmallow
data structures that build the data model.
[ ]:
from inscripta.biocantor.io.gff3.parser import parse_standard_gff3, AnnotationCollectionModel
# Example GFF3 annotation file; `gff3` is reused by later cells.
gff3 = "tests/data/INSC1006_chrI.gff3"
# Materialize every parsed record, then keep only the first one.
records = list(parse_standard_gff3(gff3))
model = records[0]
# Build the annotation collection that the export examples below operate on.
parsed = model.to_annotation_collection()
GFF3
Each of the five interval objects in BioCantor is capable of directly printing to GFF3 via `to_gff()`. If this function is called on a parent object, it is also called recursively on each of its children.
[ ]:
# Calling to_gff() on the top-level collection yields one GFF3 row per
# gene, transcript, exon and CDS; print each row on its own line.
gff_rows = parsed.to_gff()
for row in gff_rows:
    print(row)
CM021111.1 BioCantor gene 16175 18079 . + . ID=95dcc29c-0b5c-db9a-a1dc-83e2b81a7ccc;gene_biotype=ncRNA;gene_id=8ad3f444-384e-35e0-e560-aef88bd2863f;locus_tag=GI526_G0000001
CM021111.1 BioCantor transcript 16175 18079 . - . ID=90ee4e5b-64de-11fb-5d87-7a98577463bb;Parent=95dcc29c-0b5c-db9a-a1dc-83e2b81a7ccc;Name=GI526_G0000001;gene_biotype=ncRNA;gene_id=8ad3f444-384e-35e0-e560-aef88bd2863f;locus_tag=GI526_G0000001;ncrna_class=other;note=CAT%20transcript%20id:%20T0000001%3B%20CAT%20alignment%20id:%20IsoSeq-PB.2586.1%3B%20CAT%20novel%20prediction:%20IsoSeq;transcript_biotype=ncRNA;transcript_id=GI526_G0000001;transcript_name=GI526_G0000001
CM021111.1 BioCantor exon 16175 18079 . - . ID=exon-90ee4e5b-64de-11fb-5d87-7a98577463bb-1;Parent=90ee4e5b-64de-11fb-5d87-7a98577463bb;Name=GI526_G0000001;gene_biotype=ncRNA;gene_id=8ad3f444-384e-35e0-e560-aef88bd2863f;locus_tag=GI526_G0000001;ncrna_class=other;note=CAT%20transcript%20id:%20T0000001%3B%20CAT%20alignment%20id:%20IsoSeq-PB.2586.1%3B%20CAT%20novel%20prediction:%20IsoSeq;transcript_biotype=ncRNA;transcript_id=GI526_G0000001;transcript_name=GI526_G0000001
CM021111.1 BioCantor gene 37462 39103 . + . ID=171872e6-a062-50f3-c18a-239a37b8d865;Name=GDH3;gene=GDH3;gene_biotype=protein_coding;gene_id=a1b669f1-57f6-ae9b-8f4f-a27a6e84d15a;gene_name=GDH3;locus_tag=GI526_G0000002
CM021111.1 BioCantor transcript 37462 39103 . + . ID=e412edc4-ce6a-2879-80b4-0382186de445;Parent=171872e6-a062-50f3-c18a-239a37b8d865;Name=GDH3;codon_start=1;gene=GDH3;gene_biotype=protein_coding;gene_id=a1b669f1-57f6-ae9b-8f4f-a27a6e84d15a;gene_name=GDH3;locus_tag=GI526_G0000002;note=CAT%20transcript%20id:%20T0000002%3B%20CAT%20alignment%20id:%20NM_001178204.1-0%3B%20CAT%20source%20transcript%20id:%20NM_001178204.1%3B%20CAT%20source%20transcript%20biotype:%20protein_coding;protein_id=KAF1903245.1;transcript_biotype=protein_coding;transcript_id=GI526_G0000002;transcript_name=GDH3;translation=MTSEPEFQQAYDEIVSSVEDSKIFEKFPQYKKVLPIVSVPERIIQFRVTWENDNGEQEVAQGYRVQFNSAKGPYKGGLRFHPSVNLSILKFLGFEQIFKNALTGLDMGGGKGGLCVDLKGKSDNEIRRICYAFMRELSRHIGKDTDVPAGDIGVGGREIGYLFGAYRSYKNSWEGVLTGKGLNWGGSLIRPEATGFGLVYYTQAMIDYATNGKESFEGKRVTISGSGNVAQYAALKVIELGGIVVSLSDSKGCIISETGITSEQIHDIASAKIRFKSLEEIVDEYSTFSESKMKYVAGARPWTHVSNVDIALPCATQNEVSGDEAKALVASGVKFVAEGANMGSTPEAISVFETARSTATNAKDAVWFGPPKAANLGGVAVSGLEMAQNSQKVTWTAERVDQELKKIMINCFNDCIQAAQEYSTEKNTNTLPSLVKGANIASFVMVADAMLDQGDVF
CM021111.1 BioCantor exon 37462 39103 . + . ID=exon-e412edc4-ce6a-2879-80b4-0382186de445-1;Parent=e412edc4-ce6a-2879-80b4-0382186de445;Name=GDH3;codon_start=1;gene=GDH3;gene_biotype=protein_coding;gene_id=a1b669f1-57f6-ae9b-8f4f-a27a6e84d15a;gene_name=GDH3;locus_tag=GI526_G0000002;note=CAT%20transcript%20id:%20T0000002%3B%20CAT%20alignment%20id:%20NM_001178204.1-0%3B%20CAT%20source%20transcript%20id:%20NM_001178204.1%3B%20CAT%20source%20transcript%20biotype:%20protein_coding;protein_id=KAF1903245.1;transcript_biotype=protein_coding;transcript_id=GI526_G0000002;transcript_name=GDH3;translation=MTSEPEFQQAYDEIVSSVEDSKIFEKFPQYKKVLPIVSVPERIIQFRVTWENDNGEQEVAQGYRVQFNSAKGPYKGGLRFHPSVNLSILKFLGFEQIFKNALTGLDMGGGKGGLCVDLKGKSDNEIRRICYAFMRELSRHIGKDTDVPAGDIGVGGREIGYLFGAYRSYKNSWEGVLTGKGLNWGGSLIRPEATGFGLVYYTQAMIDYATNGKESFEGKRVTISGSGNVAQYAALKVIELGGIVVSLSDSKGCIISETGITSEQIHDIASAKIRFKSLEEIVDEYSTFSESKMKYVAGARPWTHVSNVDIALPCATQNEVSGDEAKALVASGVKFVAEGANMGSTPEAISVFETARSTATNAKDAVWFGPPKAANLGGVAVSGLEMAQNSQKVTWTAERVDQELKKIMINCFNDCIQAAQEYSTEKNTNTLPSLVKGANIASFVMVADAMLDQGDVF
CM021111.1 BioCantor CDS 37638 39011 . + 0 ID=8c412dbe-9c30-69d5-b15d-9c973b274d1c-1;Parent=e412edc4-ce6a-2879-80b4-0382186de445;Name=KAF1903245.1;codon_start=1;gene=GDH3;gene_biotype=protein_coding;gene_id=a1b669f1-57f6-ae9b-8f4f-a27a6e84d15a;gene_name=GDH3;locus_tag=GI526_G0000002;note=CAT%20transcript%20id:%20T0000002%3B%20CAT%20alignment%20id:%20NM_001178204.1-0%3B%20CAT%20source%20transcript%20id:%20NM_001178204.1%3B%20CAT%20source%20transcript%20biotype:%20protein_coding;product=GDH3%20isoform%201;protein_id=KAF1903245.1;transcript_biotype=protein_coding;transcript_id=GI526_G0000002;transcript_name=GDH3;translation=MTSEPEFQQAYDEIVSSVEDSKIFEKFPQYKKVLPIVSVPERIIQFRVTWENDNGEQEVAQGYRVQFNSAKGPYKGGLRFHPSVNLSILKFLGFEQIFKNALTGLDMGGGKGGLCVDLKGKSDNEIRRICYAFMRELSRHIGKDTDVPAGDIGVGGREIGYLFGAYRSYKNSWEGVLTGKGLNWGGSLIRPEATGFGLVYYTQAMIDYATNGKESFEGKRVTISGSGNVAQYAALKVIELGGIVVSLSDSKGCIISETGITSEQIHDIASAKIRFKSLEEIVDEYSTFSESKMKYVAGARPWTHVSNVDIALPCATQNEVSGDEAKALVASGVKFVAEGANMGSTPEAISVFETARSTATNAKDAVWFGPPKAANLGGVAVSGLEMAQNSQKVTWTAERVDQELKKIMINCFNDCIQAAQEYSTEKNTNTLPSLVKGANIASFVMVADAMLDQGDVF
CM021111.1 BioCantor gene 39519 40772 . + . ID=bfd63ec7-0657-1623-bcf6-155e62c4fa6d;Name=BDH2;gene=BDH2;gene_biotype=protein_coding;gene_id=4967ade5-6d91-faeb-79ed-e57093e4e5f2;gene_name=BDH2;locus_tag=GI526_G0000003
CM021111.1 BioCantor transcript 39519 40772 . + . ID=29b6f676-e957-4508-9855-e0f904f60925;Parent=bfd63ec7-0657-1623-bcf6-155e62c4fa6d;Name=BDH2;codon_start=1;gene=BDH2;gene_biotype=protein_coding;gene_id=4967ade5-6d91-faeb-79ed-e57093e4e5f2;gene_name=BDH2;locus_tag=GI526_G0000003;protein_id=KAF1903246.1;transcript_biotype=protein_coding;transcript_id=GI526_G0000003;transcript_name=BDH2;translation=MRALAYFGKGNIRFTNHLKEPHIVAPDELVIDIAWCGICGTDLHEYTDGPIFFPEDGHTHEISHNPLPQAMGHEMAGTVLEVGPSVKNLKVGDKVVVEPTGTCRDRYRWPLSPKVDKEWCAACKKGYYNICSYLGLCGAGVQSGGFAEGVVMNESHCYKVPDFVPLDVAALIQPLAVCWHAIRVCEFKAGSTALIIGAGPIGLGTILALNAAGCKDIVVSEPAKVRRELAEKMGARVYDPTAHAAKESIDYLRSIADGGDGFDYTFDCSGLEVTLNAAIQCLTFRGTAVNLAMWGHHKIQFSPMDITLHERKYTGSMCYTHHDFETVIEALEEGRIDIDRARHMITGRVNIEDGLDGAIMKLINEKESTIKIILTPNNHGELNREADNEKKEISELSSRKDQERLRESINEAKLRHT
CM021111.1 BioCantor exon 39519 40772 . + . ID=exon-29b6f676-e957-4508-9855-e0f904f60925-1;Parent=29b6f676-e957-4508-9855-e0f904f60925;Name=BDH2;codon_start=1;gene=BDH2;gene_biotype=protein_coding;gene_id=4967ade5-6d91-faeb-79ed-e57093e4e5f2;gene_name=BDH2;locus_tag=GI526_G0000003;protein_id=KAF1903246.1;transcript_biotype=protein_coding;transcript_id=GI526_G0000003;transcript_name=BDH2;translation=MRALAYFGKGNIRFTNHLKEPHIVAPDELVIDIAWCGICGTDLHEYTDGPIFFPEDGHTHEISHNPLPQAMGHEMAGTVLEVGPSVKNLKVGDKVVVEPTGTCRDRYRWPLSPKVDKEWCAACKKGYYNICSYLGLCGAGVQSGGFAEGVVMNESHCYKVPDFVPLDVAALIQPLAVCWHAIRVCEFKAGSTALIIGAGPIGLGTILALNAAGCKDIVVSEPAKVRRELAEKMGARVYDPTAHAAKESIDYLRSIADGGDGFDYTFDCSGLEVTLNAAIQCLTFRGTAVNLAMWGHHKIQFSPMDITLHERKYTGSMCYTHHDFETVIEALEEGRIDIDRARHMITGRVNIEDGLDGAIMKLINEKESTIKIILTPNNHGELNREADNEKKEISELSSRKDQERLRESINEAKLRHT
CM021111.1 BioCantor CDS 39519 40772 . + 0 ID=8d8ecb72-2b49-c26e-0081-2ba0d403e13d-1;Parent=29b6f676-e957-4508-9855-e0f904f60925;Name=KAF1903246.1;codon_start=1;gene=BDH2;gene_biotype=protein_coding;gene_id=4967ade5-6d91-faeb-79ed-e57093e4e5f2;gene_name=BDH2;locus_tag=GI526_G0000003;product=BDH2%20isoform%201;protein_id=KAF1903246.1;transcript_biotype=protein_coding;transcript_id=GI526_G0000003;transcript_name=BDH2;translation=MRALAYFGKGNIRFTNHLKEPHIVAPDELVIDIAWCGICGTDLHEYTDGPIFFPEDGHTHEISHNPLPQAMGHEMAGTVLEVGPSVKNLKVGDKVVVEPTGTCRDRYRWPLSPKVDKEWCAACKKGYYNICSYLGLCGAGVQSGGFAEGVVMNESHCYKVPDFVPLDVAALIQPLAVCWHAIRVCEFKAGSTALIIGAGPIGLGTILALNAAGCKDIVVSEPAKVRRELAEKMGARVYDPTAHAAKESIDYLRSIADGGDGFDYTFDCSGLEVTLNAAIQCLTFRGTAVNLAMWGHHKIQFSPMDITLHERKYTGSMCYTHHDFETVIEALEEGRIDIDRARHMITGRVNIEDGLDGAIMKLINEKESTIKIILTPNNHGELNREADNEKKEISELSSRKDQERLRESINEAKLRHT
CM021111.1 BioCantor gene 41086 42503 . + . ID=a67558d9-29e1-1b91-3a33-f5d7a57a9567;Name=BDH1;gene=BDH1;gene_biotype=protein_coding;gene_id=278a932c-a0a7-e31b-3156-5860ca4a4021;gene_name=BDH1;locus_tag=GI526_G0000004
CM021111.1 BioCantor transcript 41086 42503 . + . ID=64a7aecc-da53-7633-78b4-7fe59508aee1;Parent=a67558d9-29e1-1b91-3a33-f5d7a57a9567;Name=BDH1;gene=BDH1;gene_biotype=protein_coding;gene_id=278a932c-a0a7-e31b-3156-5860ca4a4021;gene_name=BDH1;locus_tag=GI526_G0000004;note=CAT%20transcript%20id:%20T0000004%3B%20CAT%20alignment%20id:%20NM_001178202.2-0%3B%20CAT%20source%20transcript%20id:%20NM_001178202.2%3B%20CAT%20source%20transcript%20biotype:%20protein_coding;transcript_biotype=protein_coding;transcript_id=GI526_G0000004;transcript_name=BDH1
CM021111.1 BioCantor exon 41086 42503 . + . ID=exon-64a7aecc-da53-7633-78b4-7fe59508aee1-1;Parent=64a7aecc-da53-7633-78b4-7fe59508aee1;Name=BDH1;gene=BDH1;gene_biotype=protein_coding;gene_id=278a932c-a0a7-e31b-3156-5860ca4a4021;gene_name=BDH1;locus_tag=GI526_G0000004;note=CAT%20transcript%20id:%20T0000004%3B%20CAT%20alignment%20id:%20NM_001178202.2-0%3B%20CAT%20source%20transcript%20id:%20NM_001178202.2%3B%20CAT%20source%20transcript%20biotype:%20protein_coding;transcript_biotype=protein_coding;transcript_id=GI526_G0000004;transcript_name=BDH1
CM021111.1 BioCantor gene 42580 43218 . + . ID=6ada04b7-6dac-ee62-7083-bd4521f34c41;Name=ECM1;gene=ECM1;gene_biotype=ncRNA;gene_id=e8b37537-588b-43a2-eb26-c88883f06014;gene_name=ECM1;locus_tag=GI526_G0000005
CM021111.1 BioCantor transcript 42580 43218 . + . ID=2bd1af5e-472d-09ae-78fc-77e26f7b5780;Parent=6ada04b7-6dac-ee62-7083-bd4521f34c41;Name=ECM1;gene=ECM1;gene_biotype=ncRNA;gene_id=e8b37537-588b-43a2-eb26-c88883f06014;gene_name=ECM1;locus_tag=GI526_G0000005;transcript_biotype=ncRNA;transcript_id=GI526_G0000005;transcript_name=ECM1
CM021111.1 BioCantor exon 42580 43218 . + . ID=exon-2bd1af5e-472d-09ae-78fc-77e26f7b5780-1;Parent=2bd1af5e-472d-09ae-78fc-77e26f7b5780;Name=ECM1;gene=ECM1;gene_biotype=ncRNA;gene_id=e8b37537-588b-43a2-eb26-c88883f06014;gene_name=ECM1;locus_tag=GI526_G0000005;transcript_biotype=ncRNA;transcript_id=GI526_G0000005;transcript_name=ECM1
GFF3 with FASTA
In addition to being able to print GFF directly, convenience functions exist to export GFF3 in one go, and optionally include sequence info.
[ ]:
# This does not work because `parsed` was built without sequence information:
# the call below is expected to raise GFF3ExportException.
import os

from inscripta.biocantor.io.gff3.writer import collection_to_gff3

# os.devnull is the platform's null device, so the example reaches the export
# call even on systems where "/dev/null" does not exist.
with open(os.devnull, "w") as fh:
    collection_to_gff3([parsed], fh, add_sequences=True)
---------------------------------------------------------------------------
GFF3ExportException Traceback (most recent call last)
<ipython-input-8-6616014c2f77> in <module>()
3
4 with open("/dev/null", "w") as fh:
----> 5 collection_to_gff3([parsed], fh, add_sequences=True)
/Users/ian.fiddes/repos/biocantor/inscripta/biocantor/io/gff3/writer.py in collection_to_gff3(collections, gff3_handle, add_sequences, ordered, chromosome_relative_coordinates, raise_on_reserved_attributes)
60 for collection in collections:
61 if collection.sequence is None:
---> 62 raise GFF3ExportException("Cannot export FASTA in GFF3 if collection has no associated sequence")
63 print(
64 GFF3Headers.SEQUENCE_HEADER.value.format(
GFF3ExportException: Cannot export FASTA in GFF3 if collection has no associated sequence
[ ]:
# Parse the GFF3 with sequence instead this time, then export it with the
# FASTA section included.
import os

from inscripta.biocantor.io.gff3.parser import parse_gff3_embedded_fasta

# Parsing does not need the output handle, so do it before opening the sink.
parsed_with_sequence = [x.to_annotation_collection() for x in parse_gff3_embedded_fasta(gff3)]

# The output is discarded; os.devnull is the portable spelling of the null device.
with open(os.devnull, "w") as fh:
    collection_to_gff3(parsed_with_sequence, fh, add_sequences=True)
JSON
Each object also has a `to_dict()` function, which produces a dict that the `marshmallow` library understands. As a result, the two expressions compared below are identical.
[ ]:
# Round trip: dict -> marshmallow model -> annotation collection -> dict.
round_tripped = (
    AnnotationCollectionModel.Schema()
    .load(parsed.to_dict())
    .to_annotation_collection()
    .to_dict()
)
# The bare expression is the cell's displayed result.
round_tripped == parsed.to_dict()
True
However, the below is not true, only because the marshmallow schemas are `Ordered`, and so produce `OrderedDict`:
[ ]:
# Compare the collection's own dict export with the schema's dump of the model.
exported = parsed.to_dict()
dumped = AnnotationCollectionModel.Schema().dump(model)
# The bare expression is the cell's displayed result.
exported == dumped
False
BED
BED export is only valid on `TranscriptInterval` and `FeatureInterval` objects, because BED format does not model relationships between rows. All models are exported in `BED12` format.
[ ]:
# Iterating the annotation collection yields its gene / feature collections,
# and iterating each of those yields the transcripts or features that can be
# written as BED12. The generator is lazy, so rows are produced and printed
# one at a time.
bed_rows = (
    child.to_bed12()
    for collection in parsed
    for child in collection
)
for bed_row in bed_rows:
    print(bed_row)
CM021111.1 16174 18079 GI526_G0000001 0 - 0 0 0,0,0 1 1905 0
CM021111.1 37461 39103 GDH3 0 + 37637 39011 0,0,0 1 1642 0
CM021111.1 39518 40772 BDH2 0 + 39518 40772 0,0,0 1 1254 0
CM021111.1 41085 42503 BDH1 0 + 0 0 0,0,0 1 1418 0
CM021111.1 42579 43218 ECM1 0 + 0 0 0,0,0 1 639 0
GenBank
All models can be exported to GenBank. GenBank export must be specified to be in either prokaryotic or eukaryotic flavors. See the document on parsing GenBank files for an explanation of the difference.
GenBank export is problematic for genomes that have multiple isoforms per gene due to the lack of the ability to explicitly define the hierarchical relationship. BioCantor GenBank is always locus sorted, which helps resolve this ambiguity.
GenBank export also has the ability to export GenBank files compatible with the Inscripta Engineering Portal. This mode of export ensures that there is always a unique `/gene` tag on every feature, and that `CDS` features have a `/translation` tag.
The `organism` and `source` fields can be set by keyword arguments.
GenBank files can also be exported in two common GenBank file flavors, prokaryotic and eukaryotic. Eukaryotic GenBank files contain an `mRNA` feature as a child of a `gene` feature and parent of a `CDS` feature, while prokaryotic GenBank files skip the `mRNA` feature and only have `gene` and `CDS`. The GenBank writing function defaults to the prokaryotic version, but this can be adjusted by passing `genbank_type=GenbankFlavor.EUKARYOTIC`.
[ ]:
import os

from inscripta.biocantor.io.genbank.writer import collection_to_genbank

# `parsed` carries no sequence, so this call is expected to raise
# GenBankExportError. os.devnull is the platform's null device, which lets the
# example reach the export call even where "/dev/null" does not exist.
with open(os.devnull, "w") as fh:
    collection_to_genbank([parsed], fh)
---------------------------------------------------------------------------
GenBankExportError Traceback (most recent call last)
<ipython-input-16-5b5bfd3a0aaa> in <module>()
2
3 with open("/dev/null", "w") as fh:
----> 4 collection_to_genbank([parsed], fh)
/Users/ian.fiddes/repos/biocantor/inscripta/biocantor/io/genbank/writer.py in collection_to_genbank(collections, genbank_file_handle_or_path, genbank_type, force_strand, organism, source, seqrecord_annotations, update_translations)
83
84 if collection.sequence is None:
---> 85 raise GenBankExportError("Cannot export GenBank if collections do not have sequence information")
86
87 seqrecord = SeqRecord(
GenBankExportError: Cannot export GenBank if collections do not have sequence information
[ ]:
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp_dir:
    tmp_file = Path(tmp_dir) / "test.gbk"
    # The writer is given the path itself, so there is no need to also open
    # (and truncate) the same file here with a second, unused handle.
    collection_to_genbank(parsed_with_sequence, tmp_file)
    # Show only the first 2000 characters of the exported record.
    print(tmp_file.read_text()[:2000])
LOCUS CM021111.1 50040 bp DNA UNK 01-JAN-1980
DEFINITION GenBank produced by BioCantor.
ACCESSION CM021111
VERSION CM021111.1
KEYWORDS .
SOURCE .
ORGANISM .
.
FEATURES Location/Qualifiers
gene complement(16175..18079)
/gene_id="8ad3f444-384e-35e0-e560-aef88bd2863f"
/gene_biotype="ncRNA"
/locus_tag="GI526_G0000001"
/gene="8ad3f444-384e-35e0-e560-aef88bd2863f"
ncRNA complement(16175..18079)
/ncrna_class="other"
/note="CAT transcript id: T0000001; CAT alignment id:
IsoSeq-PB.2586.1; CAT novel prediction: IsoSeq"
/transcript_id="GI526_G0000001"
/transcript_name="GI526_G0000001"
/transcript_biotype="ncRNA"
/gene="8ad3f444-384e-35e0-e560-aef88bd2863f"
/locus_tag="GI526_G0000001"
gene 37462..39103
/gene="GDH3"
/gene_id="a1b669f1-57f6-ae9b-8f4f-a27a6e84d15a"
/gene_name="GDH3"
/gene_biotype="protein_coding"
/locus_tag="GI526_G0000002"
CDS 37638..39011
/codon_start=1
/gene="GDH3"
/note="CAT transcript id: T0000002; CAT alignment id:
NM_001178204.1-0; CAT source transcript id: NM_001178204.1;
CAT source transcript biotype: protein_coding"
/translation="MTSEPEFQQAYDEIVSSVEDSKIFEKFPQYKKVLPIVSVPERIIQ
FRVTWENDNGEQEVAQGYRVQFNSAKGPYKGGLRFHPSVNLSILKFLGFEQIFKNALTG
LDMGGGKGGLCVDLKGKSDNEIRRICYAFMRELSRHIGKDTDVPAGDIGVGGREIGYLF
GAYRSYKNSWEGVLTGKGLNWGGSLIRPEATGFGLVYYTQAMIDYATNGKESFEGKRVT
ISGSGNVAQ
[ ]: