Question:: How to extract specific sequences from a big FASTA file?
Random access to a FASTA or sequence file is often needed for bioinformatics analyses. Repeated random access to a large FASTA file can slow a process down considerably, and compute time is worst for very large files. The following are some techniques that can be useful, depending on the size of the FASTA file — from a few MB to a few GB.
1) Using SeqIO.index from Biopython. - works great for small genome - e.g. bacterial genomes
from Bio import SeqIO
import pandas as pd
# Input FASTA file to subset.
fastafile = "bacterialgenome.fasta"
# Headers of interest: column 'fsrA' of goi.csv.
# A set gives O(1) membership tests when filtering.
seqoi = set(pd.read_csv('goi.csv')['fsrA'].to_list())
# Index the FASTA file once; SeqIO.index returns a dict-like object with
# lazy random access, so the whole file is never loaded into memory.
record_dict = SeqIO.index(fastafile, "fasta")
# Retrieve the records whose header appears in seqoi.
# Fix: the original did record_dict[header] unconditionally, so a single
# header absent from the FASTA raised KeyError and aborted the whole subset.
sample_fasta = []
missing = []
for header in seqoi:
    if header in record_dict:
        sample_fasta.append(record_dict[header])
    else:
        missing.append(header)
if missing:
    print("WARNING: headers not found in FASTA:", ", ".join(sorted(missing)))
# Write the selected records out as a new FASTA file.
SeqIO.write(sample_fasta, "subset.fasta", "fasta")
2) faidx or pyfaidx – indexing takes a lot of time, seems to use only single thread, works great for small genomes.
# bash one liner
xargs faidx bacterialgenome.fasta < list.txt > subset.fasta
OR in python
from Bio import SeqIO
from pyfaidx import Fasta
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq  # Fix: Seq was used below but never imported (NameError).

# Index the FASTA file for random access (builds or reuses a .fai index).
allreads = Fasta('bacterialgenome.fasta')
# Read the headers of interest; `with` guarantees the handle is closed
# (the original opened the file and never closed it, and shadowed `file`).
with open("goi.csv", "r") as fh:
    # A set makes each membership test O(1) instead of scanning a list.
    roi = set(fh.read().splitlines())  # reads of interest
roi_list = []
for header in allreads.keys():
    if header in roi:
        record = allreads[header]
        seq = Seq(str(record))
        # long_name keeps the full header line as the record id.
        seqid = str(record.long_name)
        roi_list.append(SeqRecord(seq=seq, id=seqid, description="", name=""))
SeqIO.write(roi_list, "roi.fasta", "fasta")
3) samtools faidx – indexing takes a lot of time, seems to use multiple threads, works great for small to mid size file
xargs samtools faidx bacterialgenome.fasta < list.txt > subset.fasta
4) Suppose you have a large FASTA file (~100 GB) — such as all the reads combined from a metagenomics project — and you are interested in retrieving a few hundred sequences by passing header names. Here we use BLAST to solve this issue.
# Make blast db using large fasta file as input - indexing takes some time, but after that it is fast.
makeblastdb -dbtype nucl -out largeDB -in large.fasta -parse_seqids
## NOTE: Known limitation (not yet fully diagnosed) — the sequence header must be shorter than 50 characters for -parse_seqids to work.
# Search against the db
blastdbcmd -db largeDB -entry 'CSM79HLA_TR.c0000000002_1741_3253'
# 4.3) How to pass a list? -- just loop each entry from list.txt
# Fix: the original `for header in $(cat list.txt)` word-splits on all
# whitespace and leaves $header unquoted; read line-by-line and quote instead.
while read -r header; do
    blastdbcmd -db largeDB -entry "$header" >> subset.fasta
done < list.txt
Question/Task:: Clinical Genomics
Design a targeted panel for sequencing all coding exons of human genes BRCA1 and BRCA2 (HGNC names given), and provide genomic coordinate ranges (as a BED- format file) that encompass all bases in those coding exons to the panel manufacturer. The ranges must be disjoint (no overlap, and at least one genomic base must separate adjacent intervals). Such output will be used by the manufacturer to design PCR primer pairs that amplify these targets out of genomic DNA for downstream NGS.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 22 18:15:20 2020
@author: ravinpoudel
"""
import pyfaidx
import pandas as pd
import pybedtools
from collections import deque
from pyensembl import EnsemblRelease
# Ensembl annotation database; release 77 uses human reference genome GRCh38.
ensembldata = EnsemblRelease(77)
# Index the human genome FASTA. Indexing allows fast random access, which
# speeds things up when we later retrieve nucleotide sequence for coordinates.
refidx = pyfaidx.Faidx('reference/Homo_sapiens.GRCh38.dna.toplevel.fa')
# NOTE(review): this module-level assignment is shadowed by the function
# parameter below and appears to be leftover scratch code — confirm and remove.
genename ='BRCA1'
def getbed4exon(ensembldata, refidx, genename):
    """Return disjoint genomic coordinate ranges (BED format) covering all
    exons of a gene, both without and with the underlying sequence.

    Parameters
    ----------
    ensembldata : pyensembl.EnsemblRelease
        Annotation database used to look up exon ids and their loci.
    refidx : pyfaidx.Faidx
        Indexed reference genome used to fetch nucleotide sequences.
    genename : str
        HGNC gene symbol, e.g. 'BRCA1'.

    Returns
    -------
    tuple(BedTool, BedTool)
        (merged coordinate ranges, ranges annotated with seq/length/gene)
    """
    # All exon ids annotated for this gene symbol.
    exon_ids = ensembldata.exon_ids_of_gene_name(genename)
    # Collect the locus of every exon.
    exon_locus = deque()
    for exon_id in exon_ids:
        record = ensembldata.locus_of_exon_id(exon_id)
        exon_locus.append({'contig': record.contig,
                           'start': record.start,
                           'end': record.end,
                           'length': record.length,
                           'strand': record.strand})
    df = pd.DataFrame(exon_locus)
    # Convert to BED and sort, then merge so overlapping/book-ended exon
    # ranges collapse into disjoint intervals (bedtools merge guarantees
    # adjacent output intervals do not touch).
    df_bed = pybedtools.BedTool.from_dataframe(df).sort()
    df_bed_merged = df_bed.merge()
    # Attach the nucleotide sequence to each merged interval.
    ex_seq = deque()
    for interval in df_bed_merged:
        contig = interval[0]
        start = int(interval[1])
        end = int(interval[2])
        # NOTE(review): coordinates originate from pyensembl (1-based,
        # inclusive) and Faidx.fetch is also 1-based inclusive, so they are
        # passed through unchanged; confirm the convention before mixing
        # this output with strictly 0-based BED consumers.
        length = (end - start) + 1
        # Fix: the original called fa.fetch(...), but no variable named `fa`
        # exists anywhere — the indexed genome is the `refidx` parameter.
        exon_seq = str(refidx.fetch(contig, start, end))
        ex_seq.append({'contig': contig,
                       'start': start,
                       'end': end,
                       'seq': exon_seq,
                       'length': length,
                       'gene': genename})
    # Export-ready BED with the sequence annotation columns.
    final_exon = pd.DataFrame(ex_seq)
    final_bed = pybedtools.BedTool.from_dataframe(final_exon)
    # final_bed.saveas('final.bed')
    print('Number of unoverlapping record in the bedfile for', genename, '::', len(final_bed))
    return (df_bed_merged, final_bed)
##
# Generate coordinate-only and sequence-annotated BED tables for each gene.
bed_brca1_cordinates, bed_brca1_cordinates_withsequence = getbed4exon(ensembldata, refidx, 'BRCA1')
# Fix: the original BRCA2 line contained a doubled '=' ("= =") — a SyntaxError.
bed_brca2_cordinates, bed_brca2_cordinates_withsequence = getbed4exon(ensembldata, refidx, 'BRCA2')
NCBI and Python
Using NCBI APIs in Python to retrieve data.
https://www.ncbi.nlm.nih.gov/datasets/docs/languages/python/
# Modules from the Python Standard Library:
# "Regular Expressions" for searching and replacing text
import re
# creating temporary files to hold intermediate results that we don't care about saving
from collections import defaultdict
# stands for Input/Output
from io import BytesIO, StringIO
# specifying the locations of files on our computer
from pathlib import Path
# reading and writing zip files
from zipfile import ZipFile
# Modules from other packages:
# Matplotlib: plotting figures
import matplotlib.pyplot as plt
# BioPython: reading and writing sequence files,
# manipulating and aligning sequences, making phylogenetic trees
from Bio import AlignIO, Phylo, SeqIO
from Bio.Align.Applications import MuscleCommandline
from Bio.Phylo.TreeConstruction import (
NNITreeSearcher,
ParsimonyScorer,
ParsimonyTreeConstructor,
)
from Bio.SeqRecord import SeqRecord
# NCBI Datasets: searching and downloading NCBI data
from ncbi.datasets.openapi import GeneApi
#############
# NCBI Datasets: searching and downloading NCBI data
from ncbi.datasets.openapi import GeneApi
# Query parameters: NCBI taxon name and the gene symbols to look up.
taxon = "human"
symbols = ["MB", "BRCA1", "TP53"]
# Getting gene IDs from NCBI Datasets.
gene_api = GeneApi()
print(type(gene_api))
# Download gene metadata from NCBI for the requested taxon/symbols.
metadata = gene_api.gene_metadata_by_tax_and_symbol(symbols, taxon)
# Unpack information from the metadata object.
print(type(metadata.genes))
# Print a human-readable description of each gene found.
for gene_data in metadata.genes:
    print(gene_data.gene.description)
# Print symbol and numeric NCBI gene id side by side.
for gene_data in metadata.genes:
    symbol = gene_data.gene.symbol
    gene_id = gene_data.gene.gene_id
    print(f"Symbol: {symbol}\tId: {gene_id:>4}")
## making a function
def get_gene_ids(symbols, taxon):
    """Map each gene symbol to its integer NCBI gene id for the given taxon."""
    api = GeneApi()
    metadata = api.gene_metadata_by_tax_and_symbol(symbols, taxon)
    id_by_symbol = {}
    for entry in metadata.genes:
        id_by_symbol[entry.gene.symbol] = int(entry.gene.gene_id)
    return id_by_symbol
# Look up NCBI ids for our three genes of interest.
gene_symbols = ["MB", "BRCA1", "TP53"]
gene_ids = get_gene_ids(gene_symbols, "human")
print(gene_ids)
# Finding whale myoglobin orthologs
def query_orthologs(gene_id, taxa):
    """Return ortholog gene objects for `gene_id`, restricted to `taxa`."""
    response = GeneApi().gene_orthologs_by_id(gene_id, taxon_filter=taxa)
    matches = response.genes.genes
    return [entry.gene for entry in matches]
# Restrict the ortholog search to whales plus two primate reference taxa.
taxa = ["whales", "human", "Macaca mulatta"]
mb_orthologs = query_orthologs(gene_ids["MB"], taxa)
print(mb_orthologs)
# How many orthologs did we find?
print(len(mb_orthologs))
# Show each ortholog's common and scientific organism name.
for g in mb_orthologs:
    print(f"{g.common_name}: {g.taxname}")
# Integer gene ids, as needed by the download step below.
ortholog_gene_ids = [int(g.gene_id) for g in mb_orthologs]
print(ortholog_gene_ids)
# Downloading gene sequences¶
def download_transcripts(gene_ids, data_directory):
    """Download the RNA transcript FASTA for `gene_ids` and return its path.

    The NCBI data package arrives as a zip archive; only the transcript
    FASTA (ncbi_dataset/data/rna.fna) is extracted into `data_directory`.
    """
    api = GeneApi()
    print("Begin download of data package ...")
    # Request the FASTA_RNA annotation; keep the raw response bytes.
    package = api.download_gene_package(
        gene_ids, include_annotation_type=["FASTA_RNA"], _preload_content=False
    )
    # Unzip in memory and extract just the transcript FASTA to disk.
    with ZipFile(BytesIO(package.data)) as archive:
        data_file_name = archive.extract(
            "ncbi_dataset/data/rna.fna", path=data_directory
        )
    print(f"Download completed -- see {data_file_name}")
    # Hand back the location of the FASTA we just wrote.
    return Path(data_file_name)
# apply
# We're going to put all our data in a directory called `data`.
data_dir = Path("../data")
# Download and save the path of the FASTA file.
ortholog_fasta = download_transcripts(ortholog_gene_ids, data_dir)
print("Data directory contents after download:\n")
# NOTE(review): IPython/Jupyter shell escape — not valid in a plain .py script.
! ls -R {data_dir}
# Reading and manipulating sequence records.
# Get a list of sequence records from the FASTA file.
ortholog_records = list(SeqIO.parse(ortholog_fasta, "fasta"))
# Print each sequence record.
for record in ortholog_records:
    print(record, "\n")
# function to get the organism name from of a record:
def get_organism_name(record):
    """Extract the organism name from a record's ``[organism=...]`` tag.

    Returns the placeholder "OrganismNameNotFound" when the description
    carries no such tag.
    """
    pattern = r"\[organism=(?P<name>[\w ]+)\]"
    found = re.search(pattern, record.description)
    # Guard clause: no tag present -> placeholder string.
    if not found:
        return "OrganismNameNotFound"
    return found.group("name")
# Sanity check: print the parsed organism name for every record.
for rec in ortholog_records:
    print(get_organism_name(rec))
###
def records_by_organism(records):
    """Group sequence records into a dict keyed by organism name."""
    # defaultdict(list): a missing organism key starts as an empty list.
    grouped = defaultdict(list)
    for rec in records:
        grouped[get_organism_name(rec)].append(rec)
    return grouped
organism_records = records_by_organism(ortholog_records)
# items() yields key-value pairs, so we can loop over both at once.
for org, records in organism_records.items():
    print(f"{org}: {len(records)}")
# Histogram: how many records each species contributed.
counts = [len(records) for records in organism_records.values()]
plt.hist(counts, bins=range(1, 10))
plt.xlabel("Number of records")
plt.ylabel("Number of species")
plt.show()
# Histogram of transcript lengths pooled across all organisms.
transcript_lengths = [
    len(record.seq) for records in organism_records.values() for record in records
]
plt.hist(transcript_lengths)
plt.xlabel("Transcript length")
plt.ylabel("Number of transcripts")
plt.show()
def record_length(rec):
    """Length of a record's sequence — used as a sort/max key."""
    sequence = rec.seq
    return len(sequence)
def get_longest_transcripts(record_dict):
    """Return, per organism, the record with the longest sequence.

    Parameters
    ----------
    record_dict : dict
        Organism name -> list of sequence records.

    Returns
    -------
    dict mapping each organism to its single longest record.
    """
    # Fix: the original iterated the global `organism_records`, silently
    # ignoring the `record_dict` argument it was passed.
    return {
        org: max(records, key=lambda rec: len(rec.seq))
        for org, records in record_dict.items()
    }
# Pick the longest transcript per organism as its representative.
longest_transcripts = get_longest_transcripts(organism_records)
for org, rec in longest_transcripts.items():
    print(org, rec.id)
# Extracting the coding sequence
print(mb_orthologs)
def cds_region(transcript):
    """Return (begin, end) of the transcript's first annotated CDS range."""
    # `range` is a list because a CDS may span multiple ranges; use the first.
    first = transcript.cds.range[0]
    begin, end = int(first.begin), int(first.end)
    return (begin, end)
def get_cds_regions(gene_list):
    """Map each transcript accession to its (begin, end) CDS coordinates."""
    regions = {}
    # Walk every transcript of every gene (the original used a nested
    # dict comprehension; its cds_region helper is inlined here).
    for gene in gene_list:
        for transcript in gene.transcripts:
            first = transcript.cds.range[0]
            regions[transcript.accession_version] = (int(first.begin), int(first.end))
    return regions
# Compute CDS coordinates for every myoglobin ortholog transcript.
cds_regions = get_cds_regions(mb_orthologs)
print(cds_regions)
def get_cds_records(transcript_dict, cds_regions):
    """Build new records containing only the coding sequence of each transcript.

    `transcript_dict` maps organism -> transcript record; `cds_regions`
    maps record id -> (begin, end) CDS coordinates.
    """
    trimmed = []
    for organism, record in transcript_dict.items():
        # 1. Look up this transcript's CDS coordinates by its accession.
        start, end = cds_regions[record.id]
        # 2. New record for just the CDS slice.
        cds_record = SeqRecord(
            # 3. Organism (spaces -> underscores) becomes the record id.
            id=organism.replace(" ", "_"),
            name=record.name,
            description=record.description,
            # 4. Slice to the CDS: 1-based begin -> 0-based start index.
            seq=record.seq[start - 1 : end],
        )
        trimmed.append(cds_record)
    return trimmed
# Extract CDS records and eyeball their translations.
cds_records = get_cds_records(longest_transcripts, cds_regions)
for rec in cds_records:
    print(rec.seq.translate())
def check_start_codons(proteins):
    """True iff every translated protein begins with methionine ('M')."""
    return all(p.startswith("M") for p in proteins)
def check_stop_codons(proteins):
    """True iff every translated protein ends with a stop symbol ('*')."""
    return all(p.endswith("*") for p in proteins)
# Translate each CDS and verify canonical start/stop codons.
protein_seqs = [rec.seq.translate() for rec in cds_records]
print(check_start_codons(protein_seqs))
print(check_stop_codons(protein_seqs))
# Creating a multiple sequence alignment and building a tree.
cds_fasta = data_dir / "cds.fna"
SeqIO.write(cds_records, cds_fasta, "fasta")
# Check that we created a file `cds.fna` in our data directory.
# NOTE(review): IPython/Jupyter shell escape — not valid in a plain .py script.
! ls {data_dir}
def align_with_muscle(input_fasta):
    """Align the sequences in `input_fasta` with MUSCLE; return the alignment."""
    muscle_binary = Path("../bin/muscle3.8.31_i86linux64")
    cmdline = MuscleCommandline(muscle_binary, input=input_fasta)
    # Run MUSCLE: `stdout` carries the alignment text, `stderr` any errors.
    stdout, stderr = cmdline()
    # StringIO lets AlignIO read the in-memory string as though it were a file.
    return AlignIO.read(StringIO(stdout), "fasta")
# Align the CDS FASTA and show the resulting alignment.
alignment = align_with_muscle(cds_fasta)
print(alignment)
def build_parsimony_tree(alignment):
    """Construct a parsimony tree from `alignment` using NNI search."""
    tree_searcher = NNITreeSearcher(ParsimonyScorer())
    tree_builder = ParsimonyTreeConstructor(tree_searcher)
    return tree_builder.build_tree(alignment)
tree = build_parsimony_tree(alignment)
print(tree)
Phylo.draw_ascii(tree)
# Putting it all together: the full ortholog -> alignment -> tree pipeline.
# Inputs:
gene_symbol = "MB"
reference_taxon = "human"
search_taxa = ["whales", "human", "Macaca mulatta"]
data_dir = Path("../data")
# Getting orthologs:
gene_ids = get_gene_ids([gene_symbol], reference_taxon)
orthologs = query_orthologs(gene_ids[gene_symbol], search_taxa)
ortholog_gene_ids = [int(g.gene_id) for g in orthologs]
ortholog_fasta = download_transcripts(ortholog_gene_ids, data_dir)
ortholog_records = list(SeqIO.parse(ortholog_fasta, "fasta"))
organism_records = records_by_organism(ortholog_records)
# Finding the longest transcript per organism and extracting each CDS:
longest_transcripts = get_longest_transcripts(organism_records)
cds_regions = get_cds_regions(orthologs)
cds_records = get_cds_records(longest_transcripts, cds_regions)
# Making an alignment:
cds_fasta = data_dir / "cds.fna"
SeqIO.write(cds_records, cds_fasta, "fasta")
alignment = align_with_muscle(cds_fasta)
# Building and plotting the tree:
tree = build_parsimony_tree(alignment)
Phylo.draw_ascii(tree)