modules.plot

View Source

  1import gzip
  2import matplotlib.pyplot as plt
  3import numpy as np
  4
  5'''
  6plot.py, handles logic of the 'plot' subcommand
  7'''
  8
  9class AnalyseFile:
 10
 11
 12    def __init__(self, path: str):
 13        self.path = path
 14
 15        # stores the sum of each tail type, indices:
 16        # [0] = no_tail, [1] = polyU, [2] = polyA, 
 17        # [3] = mixed_AU, [4] = mixed_GC, [5] = other
 18        self.type_count_arr = np.zeros(6, dtype=np.uint32)
 19
 20        # stores the number of rows in input
 21        self.row_count = 0
 22
 23    # === File handling methods ===
 24
 25    def _get_header(self):
 26        '''
 27        Internal method, used for reading the header of input text file.
 28
 29        Args:
 30            handle: file handle generated by one of the open(file, 'r') variants, eg. gzip.open(file, 'rt')
 31
 32        Returns:
 33            list[str, ...]: header as a \t separated list of strings
 34        '''
 35        return self.handle.readline().strip().split('\t')
 36
 37    def open_gzip(self):
 38        '''
 39        Opens a gzipped file for reading as text and stores its 1st line (header) separately
 40        '''
 41        self.handle = gzip.open(self.path, "rt")
 42        self.header = self._get_header()
 43
 44    def open_plain(self):
 45        '''
 46        Opens a plain text file for reading and stores its 1st line (header) separately
 47        '''
 48        self.handle = open(self.path, 'r')
 49        self.header = self._get_header()
 50
 51    def close(self):
 52        '''
 53        For closing the file after reading
 54        '''
 55        self.handle.close()
 56
 57    def _line_process(self, line):
 58        '''
 59        Used for processing the input lines into manageable data structures (only lists for now)
 60        '''
 61        return line.strip().split('\t')
 62
 63    def __iter__(self):
 64        '''
 65        Return object iterator
 66        '''
 67        return self
 68
 69    def __next__(self):
 70        '''
 71        Define how to get the next element, i.e. line
 72        '''
 73        line = self.handle.readline()
 74        self.line = self._line_process(self.handle.readline())
 75        if line:
 76            self.line = self._line_process(line)
 77            self.row_count += 1     # tick the row counter
 78            self._count_type()
 79        else:
 80            raise StopIteration
 81
 82    # === Statistic calculation methods ===
 83
 84    def _count_type(self) -> None:
 85        '''
 86        Tick the appropriate tail type counter by 1. Value are stored as
 87        np.uint32 in a shape (6,) np array. Indice-value pairs are as follows:
 88        [0] = no_tail, [1] = polyU, [2] = polyA,
 89        [3] = mixed_AU, [4] = mixed_GC, [5] = other
 90        '''
 91        # 13th column stores the tail type
 92        type = self.line[13]
 93
 94        match type:
 95            case "no_tail":
 96                self.type_count_arr[0] += 1
 97            case "polyU":
 98                self.type_count_arr[1] += 1
 99            case "polyA":
100                self.type_count_arr[2] += 1
101            case "mixed_AU":
102                self.type_count_arr[3] += 1
103            case "mixed_GC":
104                self.type_count_arr[4] += 1
105            case "other":
106                self.type_count_arr[5] += 1
107
108    def tail_perc(self) -> np.ndarray:
109        '''
110        Calculate the percentage of reads with each tail type
111
112        Returns:
113            np.ndarray: shape (6,) float64 array with values between 0 and 100
114                representing percentages
115        '''
116        return self.type_count_arr / self.row_count * 100
117
def _stacked_bar(filenames, data, width, outfile, log_scale=False):
    '''
    Internal helper, draws one stacked barplot on its own figure and saves it.

    Args:
        filenames (list[str]): bar positions / xtick labels
        data (dict): tail type label -> array of percentages, one per filename
        width (float): bar width
        outfile (str): path of the image to write
        log_scale (bool): use a logarithmic Y axis when True

    Returns:
        None
    '''
    # a dedicated figure keeps artists left over from earlier plots
    # (or earlier calls) out of this image
    fig = plt.figure()

    try:
        bottom = np.zeros(len(filenames))

        for label, value in data.items():
            plt.bar(filenames, value, width, label=label, bottom=bottom)
            bottom += value

        plt.title("Tail type percentage")
        if log_scale:
            plt.semilogy()
        plt.xticks(filenames, rotation=45)

        plt.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
        plt.tight_layout()  # helps avoid clipping

        plt.savefig(outfile, dpi=300)
    finally:
        # release the figure even if drawing or saving failed
        plt.close(fig)


def plot(filenames: list[str], values: np.ndarray) -> None:
    '''
    Make plots from calculated statistics.
    This function is the main() of plotting. In the future each specific plot
    should have their own function, and plot() should ONLY be used for
    coordinating them

    Two stacked barplots are written to the current working directory:
    "plottest_log.png" (log Y scale) and "plottest.png" (linear Y scale).

    Args:
        filenames (list[str]): list of filenames to be used as xticks,
            probably will be changed to "experiment_names" in the next minor
            refactor
        values (np.ndarray): 2D array of float64s, rows represent tail types,
            columns represent filenames/experimental setups

    Returns:
        None
    '''

    # tail types taken into account
    types = ("no_tail", "polyU", "polyA", "mixed_AU", "mixed_GC", "other")

    # Percentage arrays are stored under their respective tail type keys
    data = dict(zip(types, values))

    # plot bar width
    width = 0.5

    # === Stacked barplot, log Y scale ===
    _stacked_bar(filenames, data, width, "plottest_log.png", log_scale=True)

    # === Stacked barplot, linear Y scale ===
    _stacked_bar(filenames, data, width, "plottest.png")

    return None
183
def ploter(infiles: list[str]) -> None:
    '''
    Plot figures for input files generated by Slipper analyse. For now only
    tail type percent per input file is available

    Args:
        infiles (list[str]): list of filenames or paths to tsv.gz files
            generated by Slipper analyse

    Returns:
        None

    Raises:
        ValueError: raised by np.vstack when infiles is empty
    '''

    analysers = [AnalyseFile(path) for path in infiles]

    # only the last '/' separated component is used as the label
    filenames = [path.split('/')[-1] for path in infiles]

    for analyser in analysers:

        # open gziped file for reading
        analyser.open_gzip()

        try:
            # counting rows and tail_type occurences is handled by
            # AnalyseFile.__next__() method
            for _ in analyser:
                pass
        finally:
            # make sure the handle is released even if a row fails to parse
            analyser.close()

    # AnalyseFile.tail_perc() calculates the relative frequency of each tail
    # type per input file; after the transpose rows are tail types and
    # columns are input files
    percs = np.vstack([analyser.tail_perc() for analyser in analysers]).transpose()

    # run the main plotting function
    plot(filenames=filenames, values=percs)

class AnalyseFile: View Source

 10class AnalyseFile:
 11
 12
 13    def __init__(self, path: str):
 14        self.path = path
 15
 16        # stores the sum of each tail type, indices:
 17        # [0] = no_tail, [1] = polyU, [2] = polyA, 
 18        # [3] = mixed_AU, [4] = mixed_GC, [5] = other
 19        self.type_count_arr = np.zeros(6, dtype=np.uint32)
 20
 21        # stores the number of rows in input
 22        self.row_count = 0
 23
 24    # === File handling methods ===
 25
 26    def _get_header(self):
 27        '''
 28        Internal method, used for reading the header of input text file.
 29
 30        Args:
 31            handle: file handle generated by one of the open(file, 'r') variants, eg. gzip.open(file, 'rt')
 32
 33        Returns:
 34            list[str, ...]: header as a \t separated list of strings
 35        '''
 36        return self.handle.readline().strip().split('\t')
 37
 38    def open_gzip(self):
 39        '''
 40        Opens a gzipped file for reading as text and stores its 1st line (header) separately
 41        '''
 42        self.handle = gzip.open(self.path, "rt")
 43        self.header = self._get_header()
 44
 45    def open_plain(self):
 46        '''
 47        Opens a plain text file for reading and stores its 1st line (header) separately
 48        '''
 49        self.handle = open(self.path, 'r')
 50        self.header = self._get_header()
 51
 52    def close(self):
 53        '''
 54        For closing the file after reading
 55        '''
 56        self.handle.close()
 57
 58    def _line_process(self, line):
 59        '''
 60        Used for processing the input lines into manageable data structures (only lists for now)
 61        '''
 62        return line.strip().split('\t')
 63
 64    def __iter__(self):
 65        '''
 66        Return object iterator
 67        '''
 68        return self
 69
 70    def __next__(self):
 71        '''
 72        Define how to get the next element, i.e. line
 73        '''
 74        line = self.handle.readline()
 75        self.line = self._line_process(self.handle.readline())
 76        if line:
 77            self.line = self._line_process(line)
 78            self.row_count += 1     # tick the row counter
 79            self._count_type()
 80        else:
 81            raise StopIteration
 82
 83    # === Statistic calculation methods ===
 84
 85    def _count_type(self) -> None:
 86        '''
 87        Tick the appropriate tail type counter by 1. Value are stored as
 88        np.uint32 in a shape (6,) np array. Indice-value pairs are as follows:
 89        [0] = no_tail, [1] = polyU, [2] = polyA,
 90        [3] = mixed_AU, [4] = mixed_GC, [5] = other
 91        '''
 92        # 13th column stores the tail type
 93        type = self.line[13]
 94
 95        match type:
 96            case "no_tail":
 97                self.type_count_arr[0] += 1
 98            case "polyU":
 99                self.type_count_arr[1] += 1
100            case "polyA":
101                self.type_count_arr[2] += 1
102            case "mixed_AU":
103                self.type_count_arr[3] += 1
104            case "mixed_GC":
105                self.type_count_arr[4] += 1
106            case "other":
107                self.type_count_arr[5] += 1
108
109    def tail_perc(self) -> np.ndarray:
110        '''
111        Calculate the percentage of reads with each tail type
112
113        Returns:
114            np.ndarray: shape (6,) float64 array with values between 0 and 100
115                representing percentages
116        '''
117        return self.type_count_arr / self.row_count * 100

AnalyseFile(path: str) View Source

13    def __init__(self, path: str):
14        self.path = path
15
16        # stores the sum of each tail type, indices:
17        # [0] = no_tail, [1] = polyU, [2] = polyA, 
18        # [3] = mixed_AU, [4] = mixed_GC, [5] = other
19        self.type_count_arr = np.zeros(6, dtype=np.uint32)
20
21        # stores the number of rows in input
22        self.row_count = 0

path

type_count_arr

row_count

def open_gzip(self): View Source

38    def open_gzip(self):
39        '''
40        Opens a gzipped file for reading as text and stores its 1st line (header) separately
41        '''
42        self.handle = gzip.open(self.path, "rt")
43        self.header = self._get_header()

Opens a gzipped file for reading as text and stores its 1st line (header) separately

def open_plain(self): View Source

45    def open_plain(self):
46        '''
47        Opens a plain text file for reading and stores its 1st line (header) separately
48        '''
49        self.handle = open(self.path, 'r')
50        self.header = self._get_header()

Opens a plain text file for reading and stores its 1st line (header) separately

def close(self): View Source

52    def close(self):
53        '''
54        For closing the file after reading
55        '''
56        self.handle.close()

For closing the file after reading

def tail_perc(self) -> numpy.ndarray: View Source

109    def tail_perc(self) -> np.ndarray:
110        '''
111        Calculate the percentage of reads with each tail type
112
113        Returns:
114            np.ndarray: shape (6,) float64 array with values between 0 and 100
115                representing percentages
116        '''
117        return self.type_count_arr / self.row_count * 100

Calculate the percentage of reads with each tail type

Returns: np.ndarray: shape (6,) float64 array with values between 0 and 100 representing percentages

def plot(filenames: list[str, ...], values: numpy.ndarray) -> None: View Source

def _stacked_bar(filenames, data, width, outfile, log_scale=False):
    '''
    Internal helper, draws one stacked barplot on its own figure and saves it.

    Args:
        filenames (list[str]): bar positions / xtick labels
        data (dict): tail type label -> array of percentages, one per filename
        width (float): bar width
        outfile (str): path of the image to write
        log_scale (bool): use a logarithmic Y axis when True

    Returns:
        None
    '''
    # a dedicated figure keeps artists left over from earlier plots
    # (or earlier calls) out of this image
    fig = plt.figure()

    try:
        bottom = np.zeros(len(filenames))

        for label, value in data.items():
            plt.bar(filenames, value, width, label=label, bottom=bottom)
            bottom += value

        plt.title("Tail type percentage")
        if log_scale:
            plt.semilogy()
        plt.xticks(filenames, rotation=45)

        plt.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
        plt.tight_layout()  # helps avoid clipping

        plt.savefig(outfile, dpi=300)
    finally:
        # release the figure even if drawing or saving failed
        plt.close(fig)


def plot(filenames: list[str], values: np.ndarray) -> None:
    '''
    Make plots from calculated statistics.
    This function is the main() of plotting. In the future each specific plot
    should have their own function, and plot() should ONLY be used for
    coordinating them

    Two stacked barplots are written to the current working directory:
    "plottest_log.png" (log Y scale) and "plottest.png" (linear Y scale).

    Args:
        filenames (list[str]): list of filenames to be used as xticks,
            probably will be changed to "experiment_names" in the next minor
            refactor
        values (np.ndarray): 2D array of float64s, rows represent tail types,
            columns represent filenames/experimental setups

    Returns:
        None
    '''

    # tail types taken into account
    types = ("no_tail", "polyU", "polyA", "mixed_AU", "mixed_GC", "other")

    # Percentage arrays are stored under their respective tail type keys
    data = dict(zip(types, values))

    # plot bar width
    width = 0.5

    # === Stacked barplot, log Y scale ===
    _stacked_bar(filenames, data, width, "plottest_log.png", log_scale=True)

    # === Stacked barplot, linear Y scale ===
    _stacked_bar(filenames, data, width, "plottest.png")

    return None

Make plots from calculated statistics. This function is the main() of plotting. In the future each specific plot should have their own function, and plot() should ONLY be used for coordinating them.

Args: filenames (list[str, ...]): list of filenames to be used as xticks, probably will be changed to "experiment_names" in the next minor refactor values (np.ndarray): 2D array of float64s, rows represent tail types, columns represent filenames/experimental setups

Returns: None

def ploter(infiles: list[str, ...]) -> None: View Source

def ploter(infiles: list[str]) -> None:
    '''
    Plot figures for input files generated by Slipper analyse. For now only
    tail type percent per input file is available

    Args:
        infiles (list[str]): list of filenames or paths to tsv.gz files
            generated by Slipper analyse

    Returns:
        None

    Raises:
        ValueError: raised by np.vstack when infiles is empty
    '''

    analysers = [AnalyseFile(path) for path in infiles]

    # only the last '/' separated component is used as the label
    filenames = [path.split('/')[-1] for path in infiles]

    for analyser in analysers:

        # open gziped file for reading
        analyser.open_gzip()

        try:
            # counting rows and tail_type occurences is handled by
            # AnalyseFile.__next__() method
            for _ in analyser:
                pass
        finally:
            # make sure the handle is released even if a row fails to parse
            analyser.close()

    # AnalyseFile.tail_perc() calculates the relative frequency of each tail
    # type per input file; after the transpose rows are tail types and
    # columns are input files
    percs = np.vstack([analyser.tail_perc() for analyser in analysers]).transpose()

    # run the main plotting function
    plot(filenames=filenames, values=percs)

Plot figures for input files generated by Slipper analyse. For now only tail type percent per input file is available

Args: infiles (list[str, ...]): list of filenames or paths to tsv.gz files generated by Slipper analyse

Returns: None