modules.plot
'''
plot.py, handles logic of the 'plot' subcommand
'''

import gzip
import os

import matplotlib.pyplot as plt
import numpy as np


class AnalyseFile:
    '''
    Line-by-line reader for a tab-separated input file that tallies tail types.

    Open the file with open_gzip() or open_plain(), iterate over the object to
    consume the data rows, then call close(). While iterating, every row
    increments row_count and, when its tail type is recognised, the matching
    slot of type_count_arr.
    '''

    # 0-based index of the field holding the tail type (i.e. the 14th field).
    # NOTE(review): the original comment called this the "13th column" --
    # confirm against the input file format.
    _TAIL_TYPE_COLUMN = 13

    # slot of each tail type in type_count_arr
    _TYPE_INDEX = {
        "no_tail": 0,
        "polyU": 1,
        "polyA": 2,
        "mixed_AU": 3,
        "mixed_GC": 4,
        "other": 5,
    }

    def __init__(self, path: str):
        '''
        Args:
            path (str): path of the input file; nothing is opened here
        '''
        self.path = path

        # stores the sum of each tail type, indices:
        # [0] = no_tail, [1] = polyU, [2] = polyA,
        # [3] = mixed_AU, [4] = mixed_GC, [5] = other
        self.type_count_arr = np.zeros(6, dtype=np.uint32)

        # stores the number of rows in input
        self.row_count = 0

        # set by open_gzip() / open_plain() and by iteration; declared here
        # so the full state of the object is visible in one place
        self.handle = None
        self.header = None
        self.line = None

    # === File handling methods ===

    def _get_header(self) -> list[str]:
        '''
        Internal method, reads the next line of self.handle as the header.

        Returns:
            list[str]: header fields, split on tabs
        '''
        return self._line_process(self.handle.readline())

    def open_gzip(self) -> None:
        '''
        Opens a gzipped file for reading as text and stores its 1st line
        (header) separately. The handle is closed again if reading the header
        fails, then the error is re-raised.
        '''
        self.handle = gzip.open(self.path, "rt")
        try:
            self.header = self._get_header()
        except Exception:
            # gzip.open() is lazy: a non-gzip file only fails on first read
            self.handle.close()
            raise

    def open_plain(self) -> None:
        '''
        Opens a plain text file for reading and stores its 1st line (header)
        separately. The handle is closed again if reading the header fails,
        then the error is re-raised.
        '''
        self.handle = open(self.path, 'r')
        try:
            self.header = self._get_header()
        except Exception:
            self.handle.close()
            raise

    def close(self) -> None:
        '''
        For closing the file after reading. Does nothing if no file was opened.
        '''
        if self.handle is not None:
            self.handle.close()

    def _line_process(self, line: str) -> list[str]:
        '''
        Used for processing the input lines into manageable data structures
        (only lists for now).

        Only the line terminator is removed: a plain strip() would also eat
        leading/trailing tabs and shift the columns of rows whose first or
        last field is empty.
        '''
        return line.rstrip('\r\n').split('\t')

    def __iter__(self):
        '''
        Return object iterator
        '''
        return self

    def __next__(self) -> list[str]:
        '''
        Read the next data row, tick the row counter and the tail type counter.

        Returns:
            list[str]: fields of the row that was just read

        Raises:
            StopIteration: at end of file
            IndexError: if the row has too few fields to hold a tail type
        '''
        # exactly one readline() per step -- reading twice skips every
        # second row of the input
        line = self.handle.readline()
        if not line:
            raise StopIteration

        self.line = self._line_process(line)
        self.row_count += 1  # tick the row counter
        self._count_type()
        return self.line

    # === Statistic calculation methods ===

    def _count_type(self) -> None:
        '''
        Tick the appropriate tail type counter by 1. Values are stored as
        np.uint32 in a shape (6,) np array. Index-value pairs are as follows:
        [0] = no_tail, [1] = polyU, [2] = polyA,
        [3] = mixed_AU, [4] = mixed_GC, [5] = other

        Rows with any other tail type are left uncounted here (they still
        count towards row_count).
        '''
        tail_type = self.line[self._TAIL_TYPE_COLUMN]
        slot = self._TYPE_INDEX.get(tail_type)
        if slot is not None:
            self.type_count_arr[slot] += 1

    def tail_perc(self) -> np.ndarray:
        '''
        Calculate the percentage of reads with each tail type

        Returns:
            np.ndarray: shape (6,) float64 array with values between 0 and 100
                representing percentages; all zeros if no rows were read
        '''
        if self.row_count == 0:
            # avoid 0/0 -> NaN (and a RuntimeWarning) for empty inputs
            return np.zeros(self.type_count_arr.shape, dtype=np.float64)
        return self.type_count_arr / self.row_count * 100


def _stacked_barplot(filenames, data, width, outfile, log_y=False):
    '''
    Draw one stacked barplot on the current pyplot figure and save it.

    Args:
        filenames (list[str]): x tick labels, one per bar
        data (dict): tail type label -> array of percentages (one per bar)
        width (float): bar width
        outfile (str): path of the image to write
        log_y (bool): use a logarithmic Y axis when True
    '''
    # numeric positions instead of the names themselves, so two inputs with
    # the same file name still get separate bars
    positions = np.arange(len(filenames))
    bottom = np.zeros(len(filenames))

    for label, value in data.items():
        plt.bar(positions, value, width, label=label, bottom=bottom)
        bottom += value

    plt.title("Tail type percentage")
    if log_y:
        plt.semilogy()
    plt.xticks(positions, filenames, rotation=45)

    plt.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
    plt.tight_layout()  # helps avoid clipping

    plt.savefig(outfile, dpi=300)


def plot(filenames: list[str], values: np.ndarray) -> None:
    '''
    Make plots from calculated statistics.
    This function is the main() of plotting. In the future each specific plot
    should have its own function, and plot() should ONLY be used for
    coordinating them.

    Writes "plottest_log.png" (log Y scale) and "plottest.png" (linear Y
    scale) to the current working directory.

    Args:
        filenames (list[str]): list of filenames to be used as xticks,
            probably will be changed to "experiment_names" in the next minor
            refactor
        values (np.ndarray): 2D array of float64s, rows represent tail types,
            columns represent filenames/experimental setups

    Returns:
        None
    '''
    # tail types taken into account
    types = ("no_tail", "polyU", "polyA", "mixed_AU", "mixed_GC", "other")

    # Percentage arrays are stored under their respective tail type keys
    data = dict(zip(types, values))

    # plot bar width
    width = 0.5

    # === Stacked barplot, log Y scale ===
    _stacked_barplot(filenames, data, width, "plottest_log.png", log_y=True)

    plt.clf()

    # === Stacked barplot, linear Y scale ===
    _stacked_barplot(filenames, data, width, "plottest.png")

    return None


def ploter(infiles: list[str]) -> None:
    '''
    Plot figures for input files generated by Slipper analyse. For now only
    tail type percent per input file is available

    Args:
        infiles (list[str]): list of filenames or paths to tsv.gz files
            generated by Slipper analyse

    Returns:
        None

    Raises:
        ValueError: if infiles is empty
    '''
    infiles = list(infiles)
    if not infiles:
        raise ValueError("ploter() needs at least one input file")

    readers = [AnalyseFile(path) for path in infiles]
    filenames = [os.path.basename(path) for path in infiles]

    for reader in readers:
        # open gzipped file for reading
        reader.open_gzip()
        try:
            # counting rows and tail_type occurrences is handled by
            # AnalyseFile.__next__() method
            for _ in reader:
                pass
        finally:
            # close the file even if a row could not be processed
            reader.close()

    # AnalyseFile.tail_perc() calculates the relative frequency of each tail
    # type per input file
    percs = np.vstack([reader.tail_perc() for reader in readers]).transpose()

    # run the main plotting function
    plot(filenames=filenames, values=percs)
class AnalyseFile:
    '''
    Line-by-line reader for a tab-separated input file that tallies tail types.

    Open the file with open_gzip() or open_plain(), iterate over the object to
    consume the data rows, then call close(). While iterating, every row
    increments row_count and, when its tail type is recognised, the matching
    slot of type_count_arr.
    '''

    # 0-based index of the field holding the tail type (i.e. the 14th field).
    # NOTE(review): the original comment called this the "13th column" --
    # confirm against the input file format.
    _TAIL_TYPE_COLUMN = 13

    # slot of each tail type in type_count_arr
    _TYPE_INDEX = {
        "no_tail": 0,
        "polyU": 1,
        "polyA": 2,
        "mixed_AU": 3,
        "mixed_GC": 4,
        "other": 5,
    }

    def __init__(self, path: str):
        '''
        Args:
            path (str): path of the input file; nothing is opened here
        '''
        self.path = path

        # stores the sum of each tail type, indices:
        # [0] = no_tail, [1] = polyU, [2] = polyA,
        # [3] = mixed_AU, [4] = mixed_GC, [5] = other
        self.type_count_arr = np.zeros(6, dtype=np.uint32)

        # stores the number of rows in input
        self.row_count = 0

        # set by open_gzip() / open_plain() and by iteration; declared here
        # so the full state of the object is visible in one place
        self.handle = None
        self.header = None
        self.line = None

    # === File handling methods ===

    def _get_header(self) -> list[str]:
        '''
        Internal method, reads the next line of self.handle as the header.

        Returns:
            list[str]: header fields, split on tabs
        '''
        return self._line_process(self.handle.readline())

    def open_gzip(self) -> None:
        '''
        Opens a gzipped file for reading as text and stores its 1st line
        (header) separately. The handle is closed again if reading the header
        fails, then the error is re-raised.
        '''
        self.handle = gzip.open(self.path, "rt")
        try:
            self.header = self._get_header()
        except Exception:
            # gzip.open() is lazy: a non-gzip file only fails on first read
            self.handle.close()
            raise

    def open_plain(self) -> None:
        '''
        Opens a plain text file for reading and stores its 1st line (header)
        separately. The handle is closed again if reading the header fails,
        then the error is re-raised.
        '''
        self.handle = open(self.path, 'r')
        try:
            self.header = self._get_header()
        except Exception:
            self.handle.close()
            raise

    def close(self) -> None:
        '''
        For closing the file after reading. Does nothing if no file was opened.
        '''
        if self.handle is not None:
            self.handle.close()

    def _line_process(self, line: str) -> list[str]:
        '''
        Used for processing the input lines into manageable data structures
        (only lists for now).

        Only the line terminator is removed: a plain strip() would also eat
        leading/trailing tabs and shift the columns of rows whose first or
        last field is empty.
        '''
        return line.rstrip('\r\n').split('\t')

    def __iter__(self):
        '''
        Return object iterator
        '''
        return self

    def __next__(self) -> list[str]:
        '''
        Read the next data row, tick the row counter and the tail type counter.

        Returns:
            list[str]: fields of the row that was just read

        Raises:
            StopIteration: at end of file
            IndexError: if the row has too few fields to hold a tail type
        '''
        # exactly one readline() per step -- reading twice skips every
        # second row of the input
        line = self.handle.readline()
        if not line:
            raise StopIteration

        self.line = self._line_process(line)
        self.row_count += 1  # tick the row counter
        self._count_type()
        return self.line

    # === Statistic calculation methods ===

    def _count_type(self) -> None:
        '''
        Tick the appropriate tail type counter by 1. Values are stored as
        np.uint32 in a shape (6,) np array. Index-value pairs are as follows:
        [0] = no_tail, [1] = polyU, [2] = polyA,
        [3] = mixed_AU, [4] = mixed_GC, [5] = other

        Rows with any other tail type are left uncounted here (they still
        count towards row_count).
        '''
        tail_type = self.line[self._TAIL_TYPE_COLUMN]
        slot = self._TYPE_INDEX.get(tail_type)
        if slot is not None:
            self.type_count_arr[slot] += 1

    def tail_perc(self) -> np.ndarray:
        '''
        Calculate the percentage of reads with each tail type

        Returns:
            np.ndarray: shape (6,) float64 array with values between 0 and 100
                representing percentages; all zeros if no rows were read
        '''
        if self.row_count == 0:
            # avoid 0/0 -> NaN (and a RuntimeWarning) for empty inputs
            return np.zeros(self.type_count_arr.shape, dtype=np.float64)
        return self.type_count_arr / self.row_count * 100
def __init__(self, path: str):
    '''
    Set up the counters for one input file; nothing is opened here.

    Args:
        path (str): path of the input file, stored as self.path
    '''
    self.path = path

    # stores the sum of each tail type, indices:
    # [0] = no_tail, [1] = polyU, [2] = polyA,
    # [3] = mixed_AU, [4] = mixed_GC, [5] = other
    self.type_count_arr = np.zeros(6, dtype=np.uint32)

    # stores the number of rows in input
    self.row_count = 0

    # file handle, header and current row are filled in later by the open
    # and iteration methods; declared here so the object's full state is
    # visible in one place
    self.handle = None
    self.header = None
    self.line = None
def open_gzip(self):
    '''
    Opens a gzipped file for reading as text and stores its 1st line (header) separately.

    The handle is stored as self.handle and the header as self.header. If
    reading the header fails, the handle is closed before the error is
    re-raised, so a bad input does not leak an open file.
    '''
    self.handle = gzip.open(self.path, "rt")
    try:
        self.header = self._get_header()
    except Exception:
        # gzip.open() is lazy: a non-gzip file only fails on first read
        self.handle.close()
        raise
Opens a gzipped file for reading as text and stores its 1st line (header) separately
def open_plain(self):
    '''
    Opens a plain text file for reading and stores its 1st line (header) separately.

    The handle is stored as self.handle and the header as self.header. If
    reading the header fails, the handle is closed before the error is
    re-raised, so a bad input does not leak an open file.
    '''
    self.handle = open(self.path, 'r')
    try:
        self.header = self._get_header()
    except Exception:
        self.handle.close()
        raise
Opens a plain text file for reading and stores its 1st line (header) separately
def tail_perc(self) -> np.ndarray:
    '''
    Calculate the percentage of reads with each tail type

    Returns:
        np.ndarray: shape (6,) float64 array with values between 0 and 100
            representing percentages; all zeros if no rows were read
    '''
    if self.row_count == 0:
        # avoid 0/0 -> NaN (and a RuntimeWarning) for empty inputs
        return np.zeros(self.type_count_arr.shape, dtype=np.float64)
    return self.type_count_arr / self.row_count * 100
Calculate the percentage of reads with each tail type
Returns: np.ndarray: shape (6,) float64 array with values between 0 and 100 representing percentages
def _stacked_barplot(filenames, data, width, outfile, log_y=False):
    '''
    Draw one stacked barplot on the current pyplot figure and save it.

    Args:
        filenames (list[str]): x tick labels, one per bar
        data (dict): tail type label -> array of percentages (one per bar)
        width (float): bar width
        outfile (str): path of the image to write
        log_y (bool): use a logarithmic Y axis when True
    '''
    # numeric positions instead of the names themselves, so two inputs with
    # the same file name still get separate bars
    positions = np.arange(len(filenames))
    bottom = np.zeros(len(filenames))

    for label, value in data.items():
        plt.bar(positions, value, width, label=label, bottom=bottom)
        bottom += value

    plt.title("Tail type percentage")
    if log_y:
        plt.semilogy()
    plt.xticks(positions, filenames, rotation=45)

    plt.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
    plt.tight_layout()  # helps avoid clipping

    plt.savefig(outfile, dpi=300)


def plot(filenames: list[str], values: np.ndarray) -> None:
    '''
    Make plots from calculated statistics.
    This function is the main() of plotting. In the future each specific plot
    should have its own function, and plot() should ONLY be used for
    coordinating them.

    Writes "plottest_log.png" (log Y scale) and "plottest.png" (linear Y
    scale) to the current working directory.

    Args:
        filenames (list[str]): list of filenames to be used as xticks,
            probably will be changed to "experiment_names" in the next minor
            refactor
        values (np.ndarray): 2D array of float64s, rows represent tail types,
            columns represent filenames/experimental setups

    Returns:
        None
    '''
    # tail types taken into account
    types = ("no_tail", "polyU", "polyA", "mixed_AU", "mixed_GC", "other")

    # Percentage arrays are stored under their respective tail type keys
    data = dict(zip(types, values))

    # plot bar width
    width = 0.5

    # === Stacked barplot, log Y scale ===
    _stacked_barplot(filenames, data, width, "plottest_log.png", log_y=True)

    plt.clf()

    # === Stacked barplot, linear Y scale ===
    _stacked_barplot(filenames, data, width, "plottest.png")

    return None
Make plots from calculated statistics. This function is the main() of plotting. In the future each specific plot should have its own function, and plot() should ONLY be used for coordinating them.
Args: filenames (list[str, ...]): list of filenames to be used as xticks, probably will be changed to "experiment_names" in the next minor refactor values (np.ndarray): 2D array of float64s, rows represent tail types, columns represent filenames/experimental setups
Returns: None
def ploter(infiles: list[str]) -> None:
    '''
    Plot figures for input files generated by Slipper analyse. For now only
    tail type percent per input file is available

    Args:
        infiles (list[str]): list of filenames or paths to tsv.gz files
            generated by Slipper analyse

    Returns:
        None

    Raises:
        ValueError: if infiles is empty
    '''
    # function-scope import keeps this block self-contained
    import os

    infiles = list(infiles)
    if not infiles:
        raise ValueError("ploter() needs at least one input file")

    readers = [AnalyseFile(path) for path in infiles]
    filenames = [os.path.basename(path) for path in infiles]

    for reader in readers:
        # open gzipped file for reading
        reader.open_gzip()
        try:
            # counting rows and tail_type occurrences is handled by
            # AnalyseFile.__next__() method
            for _ in reader:
                pass
        finally:
            # close the file even if a row could not be processed
            reader.close()

    # AnalyseFile.tail_perc() calculates the relative frequency of each tail
    # type per input file
    percs = np.vstack([reader.tail_perc() for reader in readers]).transpose()

    # run the main plotting function
    plot(filenames=filenames, values=percs)
Plot figures for input files generated by Slipper analyse. For now only tail type percent per input file is available
Args: infiles (list[str, ...]): list of filenames or paths to tsv.gz files generated by Slipper analyse
Returns: None