https://www.youtube.com/watch?v=-0Xur_HtHaM&list=PLaE61CK5r6_l2fxVp3r3OP0fgTSTdQUoQ
================================================================================
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
# Build a Seq from a plain string (no alphabet given) and display it.
a = Seq("AGTCAGTC")
print("a", a)
================================================================================
# c a: DNA sequence string
a=Seq("AGTCAGTC",IUPAC.unambiguous_dna)
print("a",a)
# complement() and reverse_complement() are methods, so they have to be called;
# without the parentheses the expression is only the bound method object.
a.complement()
a.reverse_complement()
================================================================================
================================================================================
IUPAC (International Union of Pure and Applied Chemistry): the organization which defines the standard sequence "alphabets"
================================================================================
IUPAC: defines a one-letter alphabet for the 20 standard amino acids
ExtendedIUPACProtein:
the same alphabet extended with letters for some additional amino acids
================================================================================
For nucleotide,
- IUPACUnambiguousDNA
- IUPACAmbiguousDNA:
a single letter can stand for two or more bases; for example, the letter R stands for either A or G
- ExtendedIUPACDNA
- B: 5-bromouridine
- D: 5,6-dihydrouridine
- S: Thiouridine
- W: wyosine
- IUPACUnambiguousRNA
- IUPACAmbiguousRNA
================================================================================
SeqRecord object
Attributes
- .seq: Seq object
- .id: ID which is used to identify the sequence
- .name: title string
- .description: human readable description
- .letter_annotations: dict()
- .annotations: dict()
- .features: list of SeqFeature objects
- .dbxrefs: list of database cross-references
================================================================================
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Wrap a bare Seq in a SeqRecord; only the sequence is supplied here.
simple_seq = Seq("AGTC")
simple_seq_r = SeqRecord(simple_seq)
================================================================================
SeqFeature object
Attributes
- .type: CDS, gene
- .location
- .qualifiers: dict(), additional information
- .sub_features: list of sub SeqFeatures objects
================================================================================
Download file
================================================================================
================================================================================
SeqIO
- For sequence input and output
from Bio import SeqIO
# Iterate over every record of the GenBank file and print it in FASTA format.
record_iterator=SeqIO.parse("input.gb","gb")
for record in record_iterator:
    # The loop body must be indented, otherwise Python raises IndentationError.
    print(record.format("fasta"))
# Or you can also use
# SeqIO.write(record,"outputfilename","fasta")
================================================================================
gi=SeqIO.parse("sequence.gb","gb")
# next(gi) works on both Python 2.6+ and Python 3; gi.next() exists only on Python 2.
record=next(gi)
record.id
record.name
record.description
record.annotations
# record.features is a list of SeqFeature objects, so pick one feature
# before reading its .type / .location attributes.
f=record.features[0]
f.type
# source
f.location
================================================================================
# Render the same record in three flat-file formats and print each one
# next to the expression that produced it.

# GenBank format
gb_text = record.format("gb")
print('record.format("gb")', gb_text)

# FASTA format
fasta_text = record.format("fasta")
print('record.format("fasta")', fasta_text)

# EMBL format
embl_text = record.format("embl")
print('record.format("embl")', embl_text)
================================================================================
Formats which you can use
================================================================================
Quiz
- Read multiple FASTA files
- Write reverse complement in FASTA format
import sys
from Bio import SeqIO
# Read FASTA records from standard input and print each one, reverse
# complemented, in FASTA format.
for record in SeqIO.parse(sys.stdin,"fasta"):
    record.seq=record.seq.reverse_complement()
    record.description=record.description+" reverse complemented"
    print(record.format("fasta"))
    # print(record.format("gb"))
    # Error: fasta ---> gb is not possible here,
    # because the GenBank format needs information (e.g. the molecule type)
    # which a record read from FASTA does not have
================================================================================
How to use Entrez (NCBI's search and retrieval system for its databases)
BioPython's Entrez module is for searching NCBI and downloading data (for example, sequences that can be read into SeqRecord objects)
from Bio import Entrez

# Set the contact e-mail before issuing any Entrez request.
Entrez.email = "yours@yours.com"

# einfo() with no arguments returns a handle; print the raw text it delivers.
info_handle = Entrez.einfo()
print(info_handle.read())
The output lists all of the Entrez databases which you can use
================================================================================
from Bio import Entrez

Entrez.email = "yours@yours.com"

# Search PubMed for the term "biopython" and parse the reply.
handle = Entrez.esearch(db="pubmed", term="biopython")
record = Entrez.read(handle)

# "IdList" holds the PubMed IDs of the hits;
# each ID identifies one published paper.
record["IdList"]
================================================================================
# Organism: Cypripedioideae
# Gene name: matK
# The organism field tag is [Orgn]; the taxon name must be spelled exactly,
# otherwise the search does not match the intended organism.
handle=Entrez.esearch(db="nucleotide",term="Cypripedioideae[Orgn] AND matK[Gene]")
record=Entrez.read(handle)
record["Count"]
record["IdList"]
================================================================================
from Bio import Entrez
from Bio import SeqIO
Entrez.email="yours@yours.com"
# Download one nucleotide entry as GenBank text (the function is efetch)
# and parse it into a single SeqRecord.
handle=Entrez.efetch(db="nucleotide",id="186972394",rettype="gb",retmode="text")
record=SeqIO.read(handle,"genbank")
================================================================================
from Bio import Entrez
from Bio import Medline
Entrez.email="yours@yours.com"
# Step 1: search PubMed and collect the IDs of the matching papers.
handle=Entrez.esearch(db="pubmed",term="biopython")
record=Entrez.read(handle)
record["IdList"]
idlist=record["IdList"]
# Step 2: download those papers in MEDLINE text format (the function is efetch).
handle=Entrez.efetch(db="pubmed",id=idlist,rettype="medline",retmode="text")
# Step 3: parse the MEDLINE text and print the "AU" field of each record.
records=Medline.parse(handle)
for record in records:
    print(record["AU"])