modules.extract
1import io, gzip, tempfile, shutil 2import pysam 3from dataclasses import dataclass, astuple 4 5''' 6extract.py, handles logic of the 'extract' subcommand 7''' 8 9@dataclass 10class TsvLine: 11 ''' 12 Each field needs to be a str, because they're '\t'.join't before writing to file. 13 ''' 14 QNAME: str 15 FLAGS: str 16 RNAME: str 17 POS: str 18 CLIP5: str 19 CLIP3: str 20 SEQ: str 21 22def basic_write( 23 in_bam: pysam.AlignmentFile, 24 write_file: str, 25 seq_switch: bool) -> None: 26 """ 27 Basic function for writing the output tsv file, it's used for: buffered 28 plain text tsv writing and gzip compressed tsv writing within plain_write() 29 and gzip_write(). 30 31 Args: 32 in_bam (str): path to input BAM file 33 write_file (str): name of the output file 34 seq_switch (bool): controls whether SEQ column should contain whole read 35 sequence or NA 36 37 Returns: 38 None 39 """ 40 41 # write column headers to output file 42 write_file.write(f"QNAME\tFLAGS\tRNAME\tPOS\tCLIP5\tCLIP3\tSEQ\n") 43 44 # iterate over bam lines 45 for read in in_bam.fetch(): 46 47 # get CIGAR string in tuple format 48 # eg. (4, 7) is the same as 7S 49 cigar = read.cigartuples 50 51 # initialize dataclass with row values, all values must be strings 52 line = TsvLine( 53 read.query_name, 54 f"{read.flag}:{read.flag:012b}", 55 read.reference_name, 56 str(read.reference_start), 57 read.query_sequence[:cigar[0][1]] if cigar[0][0] == 4 else "NA", 58 read.query_sequence[:cigar[-1][1]] if cigar[-1][0] == 4 else "NA", 59 read.query_sequence if seq_switch else "NA" 60 ) 61 62 # write row into the output tsv file 63 write_file.write('\t'.join(astuple(line)) + '\n') 64 65 return None 66 67def plain_write( 68 in_bam: str, 69 out_name: str, 70 seq_switch: bool=False 71 ) -> None: 72 """ 73 Writes the output into a plain text file 74 75 Args: 76 in_bam (str): path to input BAM file 77 out_name (str): name of the output file 78 seq_switch (bool): controls whether the original read SEQ should be 79 included 80 81 Returns: 82 None 83 """ 84 with io.open(out_name, 'w', buffering=524_288) as f: 85 basic_write(in_bam, f, seq_switch) 86 87 return None 88 89def gzip_write( 90 in_bam: str, 91 out_name: str, 92 gzip_level: int, 93 seq_switch: bool=False) -> None: 94 """ 95 Writes the output into a gzip compressed plain text file 96 97 Args: 98 in_bam (str): path to input BAM file 99 out_name (str): name of the output file 100 c_level (int): gzip compression level, default argparse value sets it to 1 101 seq_switch (bool): controls whether the original read SEQ should be 102 included 103 104 Returns: 105 None 106 """ 107 with gzip.open(out_name, "wt", compresslevel=gzip_level) as f: 108 basic_write(in_bam, f, seq_switch) 109 110 return None 111 112def extractor(in_bam: str, 113 out_name: str, 114 seq_switch: bool, 115 gzip_switch: bool, 116 gzip_level: int) -> None: 117 ''' 118 Function for handling the main logic of 'extractor' subcommand 119 120 Args: 121 in_bam (str): input BAM path 122 out_name (str): name of output TSV 123 seq_switch (bool): controls whether to write the original SEQ to the 124 TSV 125 gzip_switch (bool): controls whether to gzip the output TSV 126 gzip_level (int): gzip compression level, default is 2 (set in cli.py 127 parsing module) 128 129 Returns: 130 None 131 ''' 132 # open input BAM for reading 133 bamfile = pysam.AlignmentFile(in_bam, "rb") 134 135 # write to tsv.gz or tsv 136 if gzip_switch: 137 gzip_write( 138 in_bam=bamfile, 139 out_name=f"{out_name}.tsv.gz", 140 seq_switch=seq_switch, 141 gzip_level=gzip_level 142 ) 143 144 else: 145 plain_write( 146 in_bam=bamfile, 147 out_name=f"{out_name}.tsv", 148 seq_switch=seq_switch 149 ) 150 151 # close input BAM 152 bamfile.close() 153 154 return None
10@dataclass 11class TsvLine: 12 ''' 13 Each field needs to be a str, because they're '\t'.join't before writing to file. 14 ''' 15 QNAME: str 16 FLAGS: str 17 RNAME: str 18 POS: str 19 CLIP5: str 20 CLIP3: str 21 SEQ: str
Each field needs to be a str, because they're ' '.join't before writing to file.
23def basic_write( 24 in_bam: pysam.AlignmentFile, 25 write_file: str, 26 seq_switch: bool) -> None: 27 """ 28 Basic function for writing the output tsv file, it's used for: buffered 29 plain text tsv writing and gzip compressed tsv writing within plain_write() 30 and gzip_write(). 31 32 Args: 33 in_bam (str): path to input BAM file 34 write_file (str): name of the output file 35 seq_switch (bool): controls whether SEQ column should contain whole read 36 sequence or NA 37 38 Returns: 39 None 40 """ 41 42 # write column headers to output file 43 write_file.write(f"QNAME\tFLAGS\tRNAME\tPOS\tCLIP5\tCLIP3\tSEQ\n") 44 45 # iterate over bam lines 46 for read in in_bam.fetch(): 47 48 # get CIGAR string in tuple format 49 # eg. (4, 7) is the same as 7S 50 cigar = read.cigartuples 51 52 # initialize dataclass with row values, all values must be strings 53 line = TsvLine( 54 read.query_name, 55 f"{read.flag}:{read.flag:012b}", 56 read.reference_name, 57 str(read.reference_start), 58 read.query_sequence[:cigar[0][1]] if cigar[0][0] == 4 else "NA", 59 read.query_sequence[:cigar[-1][1]] if cigar[-1][0] == 4 else "NA", 60 read.query_sequence if seq_switch else "NA" 61 ) 62 63 # write row into the output tsv file 64 write_file.write('\t'.join(astuple(line)) + '\n') 65 66 return None
Basic function for writing the output tsv file, it's used for: buffered plain text tsv writing and gzip compressed tsv writing within plain_write() and gzip_write().
Args: in_bam (str): path to input BAM file write_file (str): name of the output file seq_switch (bool): controls whether SEQ column should contain whole read sequence or NA
Returns: None
68def plain_write( 69 in_bam: str, 70 out_name: str, 71 seq_switch: bool=False 72 ) -> None: 73 """ 74 Writes the output into a plain text file 75 76 Args: 77 in_bam (str): path to input BAM file 78 out_name (str): name of the output file 79 seq_switch (bool): controls whether the original read SEQ should be 80 included 81 82 Returns: 83 None 84 """ 85 with io.open(out_name, 'w', buffering=524_288) as f: 86 basic_write(in_bam, f, seq_switch) 87 88 return None
Writes the output into a plain text file
Args: in_bam (str): path to input BAM file out_name (str): name of the output file seq_switch (bool): controls whether the original read SEQ should be included
Returns: None
90def gzip_write( 91 in_bam: str, 92 out_name: str, 93 gzip_level: int, 94 seq_switch: bool=False) -> None: 95 """ 96 Writes the output into a gzip compressed plain text file 97 98 Args: 99 in_bam (str): path to input BAM file 100 out_name (str): name of the output file 101 c_level (int): gzip compression level, default argparse value sets it to 1 102 seq_switch (bool): controls whether the original read SEQ should be 103 included 104 105 Returns: 106 None 107 """ 108 with gzip.open(out_name, "wt", compresslevel=gzip_level) as f: 109 basic_write(in_bam, f, seq_switch) 110 111 return None
Writes the output into a gzip compressed plain text file
Args: in_bam (str): path to input BAM file out_name (str): name of the output file c_level (int): gzip compression level, default argparse value sets it to 1 seq_switch (bool): controls whether the original read SEQ should be included
Returns: None
113def extractor(in_bam: str, 114 out_name: str, 115 seq_switch: bool, 116 gzip_switch: bool, 117 gzip_level: int) -> None: 118 ''' 119 Function for handling the main logic of 'extractor' subcommand 120 121 Args: 122 in_bam (str): input BAM path 123 out_name (str): name of output TSV 124 seq_switch (bool): controls whether to write the original SEQ to the 125 TSV 126 gzip_switch (bool): controls whether to gzip the output TSV 127 gzip_level (int): gzip compression level, default is 2 (set in cli.py 128 parsing module) 129 130 Returns: 131 None 132 ''' 133 # open input BAM for reading 134 bamfile = pysam.AlignmentFile(in_bam, "rb") 135 136 # write to tsv.gz or tsv 137 if gzip_switch: 138 gzip_write( 139 in_bam=bamfile, 140 out_name=f"{out_name}.tsv.gz", 141 seq_switch=seq_switch, 142 gzip_level=gzip_level 143 ) 144 145 else: 146 plain_write( 147 in_bam=bamfile, 148 out_name=f"{out_name}.tsv", 149 seq_switch=seq_switch 150 ) 151 152 # close input BAM 153 bamfile.close() 154 155 return None
Function for handling the main logic of 'extractor' subcommand
Args: in_bam (str): input BAM path out_name (str): name of output TSV seq_switch (bool): controls whether to write the original SEQ to the TSV gzip_switch (bool): controls whether to gzip the output TSV gzip_level (int): gzip compression level, default is 2 (set in cli.py parsing module)
Returns: None