modules.extract

  1import io, gzip, tempfile, shutil
  2import pysam
  3from dataclasses import dataclass, astuple
  4
  5'''
  6extract.py, handles logic of the 'extract' subcommand
  7'''
  8
  9@dataclass
 10class TsvLine:
 11    '''
 12    Each field needs to be a str, because they're '\t'.join't before writing to file.
 13    '''
 14    QNAME: str
 15    FLAGS: str
 16    RNAME: str
 17    POS: str
 18    CLIP5: str
 19    CLIP3: str
 20    SEQ: str
 21
 22def basic_write(
 23        in_bam: pysam.AlignmentFile,
 24        write_file: str,
 25        seq_switch: bool) -> None:
 26    """
 27    Basic function for writing the output tsv file, it's used for: buffered
 28    plain text tsv writing and gzip compressed tsv writing within plain_write()
 29    and gzip_write().
 30
 31    Args:
 32        in_bam (str): path to input BAM file
 33        write_file (str): name of the output file
 34        seq_switch (bool): controls whether SEQ column should contain whole read
 35            sequence or NA
 36
 37    Returns:
 38        None
 39    """
 40
 41    # write column headers to output file
 42    write_file.write(f"QNAME\tFLAGS\tRNAME\tPOS\tCLIP5\tCLIP3\tSEQ\n")
 43
 44    # iterate over bam lines
 45    for read in in_bam.fetch():
 46
 47        # get CIGAR string in tuple format
 48        # eg. (4, 7) is the same as 7S
 49        cigar = read.cigartuples
 50
 51        # initialize dataclass with row values, all values must be strings
 52        line = TsvLine(
 53            read.query_name,
 54            f"{read.flag}:{read.flag:012b}",
 55            read.reference_name,
 56            str(read.reference_start),
 57            read.query_sequence[:cigar[0][1]] if cigar[0][0] == 4 else "NA",
 58            read.query_sequence[:cigar[-1][1]] if cigar[-1][0] == 4 else "NA",
 59            read.query_sequence if seq_switch else "NA"
 60        )
 61
 62        # write row into the output tsv file
 63        write_file.write('\t'.join(astuple(line)) + '\n')
 64
 65    return None
 66
 67def plain_write(
 68        in_bam: str,
 69        out_name: str,
 70        seq_switch: bool=False
 71        ) -> None:
 72    """
 73    Writes the output into a plain text file
 74
 75    Args:
 76        in_bam (str): path to input BAM file
 77        out_name (str): name of the output file
 78        seq_switch (bool): controls whether the original read SEQ should be
 79            included
 80
 81    Returns:
 82        None
 83    """
 84    with io.open(out_name, 'w', buffering=524_288) as f:
 85        basic_write(in_bam, f, seq_switch)
 86
 87    return None
 88
 89def gzip_write(
 90        in_bam: str,
 91        out_name: str,
 92        gzip_level: int,
 93        seq_switch: bool=False) -> None:
 94    """
 95    Writes the output into a gzip compressed plain text file
 96
 97    Args:
 98        in_bam (str): path to input BAM file
 99        out_name (str): name of the output file
100        c_level (int): gzip compression level, default argparse value sets it to 1
101        seq_switch (bool): controls whether the original read SEQ should be
102            included
103
104    Returns:
105        None
106    """
107    with gzip.open(out_name, "wt", compresslevel=gzip_level) as f:
108        basic_write(in_bam, f, seq_switch)
109
110    return None
111
112def extractor(in_bam: str,
113              out_name: str,
114              seq_switch: bool,
115              gzip_switch: bool,
116              gzip_level: int) -> None:
117    '''
118    Function for handling the main logic of 'extractor' subcommand
119
120    Args:
121        in_bam (str): input BAM path
122        out_name (str): name of output TSV
123        seq_switch (bool): controls whether to write the original SEQ to the
124            TSV
125        gzip_switch (bool): controls whether to gzip the output TSV
126        gzip_level (int): gzip compression level, default is 2 (set in cli.py
127            parsing module)
128
129    Returns:
130        None
131    '''
132    # open input BAM for reading
133    bamfile = pysam.AlignmentFile(in_bam, "rb")
134
135    # write to tsv.gz or tsv
136    if gzip_switch:
137        gzip_write(
138            in_bam=bamfile,
139            out_name=f"{out_name}.tsv.gz",
140            seq_switch=seq_switch,
141            gzip_level=gzip_level
142        )
143
144    else:
145        plain_write(
146            in_bam=bamfile,
147            out_name=f"{out_name}.tsv",
148            seq_switch=seq_switch
149        )
150
151        # close input BAM
152        bamfile.close()
153
154    return None
@dataclass
class TsvLine:
10@dataclass
11class TsvLine:
12    '''
13    Each field needs to be a str, because they're '\t'.join't before writing to file.
14    '''
15    QNAME: str
16    FLAGS: str
17    RNAME: str
18    POS: str
19    CLIP5: str
20    CLIP3: str
21    SEQ: str

Each field needs to be a str, because they're ' '.join't before writing to file.

TsvLine( QNAME: str, FLAGS: str, RNAME: str, POS: str, CLIP5: str, CLIP3: str, SEQ: str)
QNAME: str
FLAGS: str
RNAME: str
POS: str
CLIP5: str
CLIP3: str
SEQ: str
def basic_write( in_bam: pysam.libcalignmentfile.AlignmentFile, write_file: str, seq_switch: bool) -> None:
23def basic_write(
24        in_bam: pysam.AlignmentFile,
25        write_file: str,
26        seq_switch: bool) -> None:
27    """
28    Basic function for writing the output tsv file, it's used for: buffered
29    plain text tsv writing and gzip compressed tsv writing within plain_write()
30    and gzip_write().
31
32    Args:
33        in_bam (str): path to input BAM file
34        write_file (str): name of the output file
35        seq_switch (bool): controls whether SEQ column should contain whole read
36            sequence or NA
37
38    Returns:
39        None
40    """
41
42    # write column headers to output file
43    write_file.write(f"QNAME\tFLAGS\tRNAME\tPOS\tCLIP5\tCLIP3\tSEQ\n")
44
45    # iterate over bam lines
46    for read in in_bam.fetch():
47
48        # get CIGAR string in tuple format
49        # eg. (4, 7) is the same as 7S
50        cigar = read.cigartuples
51
52        # initialize dataclass with row values, all values must be strings
53        line = TsvLine(
54            read.query_name,
55            f"{read.flag}:{read.flag:012b}",
56            read.reference_name,
57            str(read.reference_start),
58            read.query_sequence[:cigar[0][1]] if cigar[0][0] == 4 else "NA",
59            read.query_sequence[:cigar[-1][1]] if cigar[-1][0] == 4 else "NA",
60            read.query_sequence if seq_switch else "NA"
61        )
62
63        # write row into the output tsv file
64        write_file.write('\t'.join(astuple(line)) + '\n')
65
66    return None

Basic function for writing the output tsv file, it's used for: buffered plain text tsv writing and gzip compressed tsv writing within plain_write() and gzip_write().

Args: in_bam (str): path to input BAM file write_file (str): name of the output file seq_switch (bool): controls whether SEQ column should contain whole read sequence or NA

Returns: None

def plain_write(in_bam: str, out_name: str, seq_switch: bool = False) -> None:
68def plain_write(
69        in_bam: str,
70        out_name: str,
71        seq_switch: bool=False
72        ) -> None:
73    """
74    Writes the output into a plain text file
75
76    Args:
77        in_bam (str): path to input BAM file
78        out_name (str): name of the output file
79        seq_switch (bool): controls whether the original read SEQ should be
80            included
81
82    Returns:
83        None
84    """
85    with io.open(out_name, 'w', buffering=524_288) as f:
86        basic_write(in_bam, f, seq_switch)
87
88    return None

Writes the output into a plain text file

Args: in_bam (str): path to input BAM file out_name (str): name of the output file seq_switch (bool): controls whether the original read SEQ should be included

Returns: None

def gzip_write( in_bam: str, out_name: str, gzip_level: int, seq_switch: bool = False) -> None:
 90def gzip_write(
 91        in_bam: str,
 92        out_name: str,
 93        gzip_level: int,
 94        seq_switch: bool=False) -> None:
 95    """
 96    Writes the output into a gzip compressed plain text file
 97
 98    Args:
 99        in_bam (str): path to input BAM file
100        out_name (str): name of the output file
101        c_level (int): gzip compression level, default argparse value sets it to 1
102        seq_switch (bool): controls whether the original read SEQ should be
103            included
104
105    Returns:
106        None
107    """
108    with gzip.open(out_name, "wt", compresslevel=gzip_level) as f:
109        basic_write(in_bam, f, seq_switch)
110
111    return None

Writes the output into a gzip compressed plain text file

Args: in_bam (str): path to input BAM file out_name (str): name of the output file c_level (int): gzip compression level, default argparse value sets it to 1 seq_switch (bool): controls whether the original read SEQ should be included

Returns: None

def extractor( in_bam: str, out_name: str, seq_switch: bool, gzip_switch: bool, gzip_level: int) -> None:
113def extractor(in_bam: str,
114              out_name: str,
115              seq_switch: bool,
116              gzip_switch: bool,
117              gzip_level: int) -> None:
118    '''
119    Function for handling the main logic of 'extractor' subcommand
120
121    Args:
122        in_bam (str): input BAM path
123        out_name (str): name of output TSV
124        seq_switch (bool): controls whether to write the original SEQ to the
125            TSV
126        gzip_switch (bool): controls whether to gzip the output TSV
127        gzip_level (int): gzip compression level, default is 2 (set in cli.py
128            parsing module)
129
130    Returns:
131        None
132    '''
133    # open input BAM for reading
134    bamfile = pysam.AlignmentFile(in_bam, "rb")
135
136    # write to tsv.gz or tsv
137    if gzip_switch:
138        gzip_write(
139            in_bam=bamfile,
140            out_name=f"{out_name}.tsv.gz",
141            seq_switch=seq_switch,
142            gzip_level=gzip_level
143        )
144
145    else:
146        plain_write(
147            in_bam=bamfile,
148            out_name=f"{out_name}.tsv",
149            seq_switch=seq_switch
150        )
151
152        # close input BAM
153        bamfile.close()
154
155    return None

Function for handling the main logic of 'extractor' subcommand

Args: in_bam (str): input BAM path out_name (str): name of output TSV seq_switch (bool): controls whether to write the original SEQ to the TSV gzip_switch (bool): controls whether to gzip the output TSV gzip_level (int): gzip compression level, default is 2 (set in cli.py parsing module)

Returns: None