Python & BioPython
#!/usr/bin/env python

import hashlib
import os
import subprocess
from collections import defaultdict
from itertools import islice

import textdistance
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord


# Make a fasta file from tab delimited file
def tab_to_fasta(tabFile):
    """Convert a tab-delimited sequence file to FASTA format.

    Args:
        tabFile (str): path to a tab-delimited file
            (one ``id<TAB>sequence`` record per line).

    Returns:
        str: path of the FASTA file written next to the input
        (same basename, ``.fasta`` extension). Previously the function
        returned None; returning the path is a backward-compatible
        addition that lets callers chain on the converted file.
    """
    # Basename is everything before the first "." — consistent with the
    # filename handling used throughout this module.
    outputfile = tabFile.split(".")[0] + ".fasta"
    SeqIO.convert(tabFile, 'tab', outputfile, 'fasta')
    return outputfile



# Convert fastq to fasta
def fastq_to_fasta(fastqFile):
    """Convert a FASTQ file to FASTA using seqtk.

    Args:
        fastqFile (str): path to the input FASTQ file.

    Returns:
        str: path of the FASTA file written
        (same basename, ``.fasta`` extension).

    Raises:
        subprocess.CalledProcessError: if seqtk exits non-zero.
        FileNotFoundError: if seqtk is not on PATH.
    """
    parameters = ["seqtk", "seq", "-a", fastqFile]
    outputfile = fastqFile.split(".")[0] + ".fasta"
    # "w" (not "a"): re-running must overwrite, not append duplicate
    # records. subprocess.run (not Popen) waits for seqtk to finish, so
    # the output file is complete when this function returns; check=True
    # surfaces seqtk failures instead of silently leaving a partial file.
    with open(outputfile, "w") as f:
        subprocess.run(parameters, stdout=f, check=True)
    return outputfile


def run_vsearch(barcodefile, reads, cluster_id=0.75):
    """Align reads against a barcode FASTA with ``vsearch --usearch_global``.

    Args:
        barcodefile (str): FASTA file of barcode sequences (used as --db).
        reads (str): FASTA file of reads to search.
        cluster_id (float): minimum pairwise identity for a hit (--id).

    Returns:
        None. Writes ``<barcode basename>__output.txt`` with one line per
        alignment (query, target, lengths, identity, coverage, alignment
        rows, positions, strands). Errors are printed, not raised.
    """
    try:
        out_info = 'query+target+ql+tl+id+tcov+qcov+ids+gaps+qrow+trow+id4+qilo+qihi+qstrand+tstrand'
        outputfile = barcodefile.split(".")[0] + "__output.txt"
        # NOTE(review): the vsearch manual spells this option --userfields;
        # confirm "--userfield" is accepted by the vsearch version in use.
        parameters = ["vsearch", "--usearch_global", reads, "--db", barcodefile,
                      "--id", str(cluster_id), "--userfield", out_info, "--strand", "both", "--userout", outputfile]
        # Fix: check=True is required for CalledProcessError to ever be
        # raised — without it the except clause below was unreachable.
        p0 = subprocess.run(parameters, stderr=subprocess.PIPE, check=True)
        print(p0.stderr.decode('utf-8'))
    except subprocess.CalledProcessError as e:
        print(str(e))
    def head_dict(dict, n):
    """Print first 'n ~ size' number of entries in a dictionary.

    Args:
        input (str): dict: the input dictionary to be parsed

        input (int): n: interger specifying the size of return dictionary

    Returns:
        print n entries for dictionary
    """
    k = 0
    for items in dict.items():
        if k < n:
            print("\n", items)
            k += 1


def sample_dict(dict, n):
    """Return a new dictionary holding the first *n* entries of *dict*.

    Args:
        dict (dict): the dictionary to sample (parameter name kept for
            backward compatibility; it shadows the builtin).
        n (int): number of entries to keep.

    Returns:
        dict: a new dictionary with the first ``min(n, len(dict))``
        entries, in insertion order.
    """
    # Fix: the original called islice() without importing it (NameError
    # at runtime); islice is now imported at module level. Slicing
    # .items() directly also replaces the O(len * n) membership scan.
    return {key: value for key, value in islice(dict.items(), n)}
    
    
def _edit_stats(a, b):
    """Return (Levenshtein distance, match count) for two strings.

    The match count is the number of aligned equal characters on an
    optimal (minimum-distance) alignment path; ties in distance are
    broken in favour of more matches.
    """
    # Rolling one-row DP over (distance, matches) pairs.
    prev = [(j, 0) for j in range(len(b) + 1)]
    for i, ca in enumerate(a, 1):
        cur = [(i, 0)]
        for j, cb in enumerate(b, 1):
            d_del, m_del = prev[j]        # delete ca
            d_ins, m_ins = cur[j - 1]     # insert cb
            d_sub, m_sub = prev[j - 1]    # substitute / match
            if ca == cb:
                diag = (d_sub, m_sub + 1)
            else:
                diag = (d_sub + 1, m_sub)
            cur.append(min(diag, (d_del + 1, m_del), (d_ins + 1, m_ins),
                           key=lambda t: (t[0], -t[1])))
        prev = cur
    return prev[-1]


def get_edit_distance(file, barcodefile):
    """Compute edit distances between all read pairs across two FASTA files.

    Args:
        file (str): FASTA file containing reads.
        barcodefile (str): second FASTA file containing reads/barcodes.

    Returns:
        list: one ``[read_id, barcode_id, edit_distance, matches]`` entry
        per (read, barcode) pair, in file order.
    """
    # Fix: the original called edit_distance.SequenceMatcher, but no
    # "edit_distance" module is imported anywhere in this file, so every
    # call raised NameError. Replaced with the stdlib-only DP above.
    edist = []
    with open(file, "r") as f1:
        for r_rec in SeqIO.parse(f1, "fasta"):
            # Re-opened per read so the barcode file is re-iterated for
            # every outer record.
            with open(barcodefile, "r") as f2:
                for b_rec in SeqIO.parse(f2, "fasta"):
                    ed, match = _edit_stats(str(r_rec.seq), str(b_rec.seq))
                    edist.append([r_rec.id, b_rec.id, ed, match])
    return edist


def get_hamming_distance(fastafile, bardict, filterdict):
    """Annotate filterdict entries with Hamming distances to barcode sequences.

    Args:
        fastafile (str): FASTA file of reads; only records whose id is a
            key of ``filterdict`` are processed.
        bardict (dict): values are SeqRecord-like objects exposing ``.id``
            and ``.seq``; only values whose id starts with "IP" are scored.
        filterdict (dict): nested mapping
            ``{read_id: {barcode_id: {'spos': int, 'epos': int, ...}}}`` —
            presumably built from vsearch alignment output; TODO confirm.

    Returns:
        dict: the same ``filterdict``, mutated in place, with a
        ``'hamDist'`` entry added under each matched "IP" barcode.
    """
    for seq_record in SeqIO.parse(fastafile, "fasta"):
        if seq_record.id in filterdict.keys():
            item = str(seq_record.seq)
            seq_key = seq_record.id
            for v in bardict.values():
                bar_key = v.id
                bar_seq = str(v.seq)
                if bar_key in filterdict[seq_key]:
                    if bar_key.startswith("IP"):
                        # Slice the aligned region [spos:epos], then keep
                        # its last 8 bases — presumably the N5 index
                        # length; confirm against the barcode design.
                        s_p = filterdict[seq_key][bar_key]['spos']
                        e_p = filterdict[seq_key][bar_key]['epos']
                        cutseq = item[s_p: e_p][-8:]
                        ham_dist = int(textdistance.hamming(cutseq, bar_seq))
                        filterdict[seq_key][bar_key]['hamDist'] = ham_dist
                        # NOTE(review): debug print left in; remove once
                        # the pipeline output is trusted.
                        print(cutseq, bar_key, ham_dist)
    return filterdict



def cluster(rbfasta, threads=1, cluster_id=0.5, min_seqlength=10):
    """Run ``vsearch --cluster_size`` to build non-redundant centroids.

    vsearch sorts by abundance, so the most abundant sequence in each
    cluster becomes the centroid.

    Args:
        rbfasta (str): input FASTA file to cluster.
        threads (int or str): number of processor threads to use.
        cluster_id (float): minimum identity for cluster membership (--id).
        min_seqlength (int): discard sequences shorter than this.

    Returns:
        None. Writes ``<basename>.uc`` (cluster table) and
        ``<basename>_centroid.fasta`` (one centroid per cluster;
        --sizeout appends abundance, --xsize strips any prior size
        annotation). Errors are printed, not raised.
    """
    try:
        centroid_fasta = rbfasta.split(".")[0] + "_centroid.fasta"
        uc_file = rbfasta.split(".")[0] + ".uc"
        parameters = ["vsearch",
                      "--cluster_size", rbfasta,
                      "--centroids", centroid_fasta,
                      "--uc", uc_file,
                      "--strand", "both",
                      "--minseqlength", str(min_seqlength),
                      "--id", str(cluster_id),
                      "--xsize",
                      "--sizeout",
                      "--threads", str(threads)]
        # Fix: check=True is required for CalledProcessError to ever be
        # raised — without it the except clause below was unreachable.
        p1 = subprocess.run(parameters, stderr=subprocess.PIPE, check=True)
        print(p1.stderr.decode('utf-8'))
    except subprocess.CalledProcessError as e:
        print(str(e))
    except FileNotFoundError as f:
        print(str(f))


def bb_maping(fastafile, ref, minid_value=0.65):
    """Map reads to a reference genome with ``bbmap.sh``.

    Args:
        fastafile (str): FASTA file of cleaned reads; its basename is
            reused for the output SAM.
        ref (str): reference genome file; its basename is reused with a
            ``.fna`` extension.
        minid_value (float): minimum alignment identity (bbmap ``minid=``).

    Returns:
        None. Writes ``<basename>.mapped.sam`` (mapped reads only, via
        ``outm=``) and ``bamscript.sh`` (a script that converts the SAM
        to sorted BAM). Errors are printed, not raised.
    """
    try:
        infile = "in=" + fastafile.split(".")[0] + ".fasta"
        outfile = "outm=" + fastafile.split(".")[0] + ".mapped.sam"
        reffile = "ref=" + ref.split(".")[0] + ".fna"
        mv = "minid=" + str(minid_value)
        parameters = ["bbmap.sh",
                      infile,
                      outfile,
                      reffile,
                      "overwrite=t", mv,
                      "bamscript=bamscript.sh",
                      "nodisk",
                      "qtrim=r",
                      "local=t"]
        # Fix: check=True is required for CalledProcessError to ever be
        # raised — without it the except clause below was unreachable.
        p2 = subprocess.run(parameters, stderr=subprocess.PIPE, check=True)
        print(p2.stderr.decode('utf-8'))
    except subprocess.CalledProcessError as e:
        print(str(e))
    except FileNotFoundError as f:
        print(str(f))


def graph_map(infile, rfile):
    """Map reads to a reference genome with ``graphmap align``.

    Args:
        infile (str): FASTA file of reads; its basename is reused for the
            output SAM.
        rfile (str): reference genome file; its basename is reused with a
            ``.fna`` extension.

    Returns:
        None. Writes ``<basename>.sam``. Errors are printed, not raised.
    """
    try:
        ref = rfile.split(".")[0] + ".fna"
        fasta = infile.split(".")[0] + ".fasta"
        outfile = infile.split(".")[0] + ".sam"
        parameters = ['graphmap', 'align',
                      '-r', ref,
                      '-d', fasta,
                      '-o', outfile]
        # Fix: check=True is required for CalledProcessError to ever be
        # raised — without it the except clause below was unreachable.
        p4 = subprocess.run(parameters, stderr=subprocess.PIPE, check=True)
        print(p4.stderr.decode('utf-8'))
    except subprocess.CalledProcessError as e:
        print(str(e))
    except FileNotFoundError as f:
        print(str(f))




# Author: Ravin Poudel, Computational Biologist (PostDoc)