21_Analysis_Datastructure/DSSP_visualizer_updateBCchapter1.py at master · NikJur/21_Analysis_Datastructure · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import os
import numpy as np
import matplotlib.pyplot as plt


def load_dssp_matrix(file_path):
    """
    Loads and subsamples a DSSP data file into a NumPy matrix.

    This function reads a DSSP output file, skipping any header lines that start with '#'.
    It is designed to subsample the data by processing every 10th line (frame), which can be
    useful for large simulation datasets. The characters in each line are converted to their
    ASCII integer representations for numerical analysis. The function stops reading after
    line 10030 to focus on a specific portion of the simulation.

    Args:
        file_path (str): The full path to the DSSP .dat file.

    Returns:
        np.ndarray: A 2D NumPy array where each row represents a time frame and each column
                    represents a residue's secondary structure assignment.

    Raises:
        ValueError: If an error occurs during file loading or processing, indicating a
                    potential issue with the file path or format.
    """
    try:
        file_content = []
        with open(file_path, 'r') as file:
            line_number = 0
            for line in file:
                line_number += 1
                if line.startswith('#'):
                    continue
                # Subsample the data by processing every 10th frame
                if line_number % 10 != 0:
                    continue

                # Convert the characters in the line to their ASCII values
                row = [ord(character) for character in line.rstrip()]
                file_content.append(row)

                # Set a limit on the number of lines to read to manage data size
                if line_number >= 10030:
                    break

        return np.array(file_content)

    except Exception as e:
        print(f"Error loading DSSP matrix from {file_path}: {e}")
        raise ValueError("Error loading DSSP matrix. Check the file structure.")


def calculate_structure_percentages(dssp_matrix):
    """
    Calculates the percentage of different secondary structures per frame.

    This function takes a DSSP matrix (from `load_dssp_matrix`) and categorizes the secondary
    structures into three main groups: alpha-helical ('H', 'G', 'I', 'P'), beta-structures
    ('B', 'E'), and other structures ('S', 'T', '=', '~'). It then calculates the percentage
    of each structural group for every time frame in the matrix.

    Args:
        dssp_matrix (np.ndarray): A matrix where rows are frames and columns are residues,
                                  with values being the ASCII codes of DSSP characters.

    Returns:
        tuple: A tuple of three NumPy arrays (p_alpha, p_beta, p_other). Each array contains
               the percentage of its respective structure type for each frame over time.
    """
    # Calculate the total number of residues per frame, excluding gaps ('-')
    total_counts = np.sum(dssp_matrix != ord('-'), axis=1)

    # Count residues belonging to the alpha-helical group
    counts_alpha_helix = np.sum(
        (dssp_matrix == ord('H')) | (dssp_matrix == ord('G')) |
        (dssp_matrix == ord('I')) | (dssp_matrix == ord('P')), axis=1)

    # Count residues belonging to the beta-structure group
    counts_beta_structures = np.sum(
        (dssp_matrix == ord('B')) | (dssp_matrix == ord('E')), axis=1)

    # Count residues belonging to the 'other' group (turns, bends, loops)
    counts_other_structures = np.sum(
        (dssp_matrix == ord('S')) | (dssp_matrix == ord('T')) |
        (dssp_matrix == ord('=')) | (dssp_matrix == ord('~')), axis=1)

    # Calculate percentages for each group. The 'where' argument prevents division by zero
    # for any frames that might have no valid residues.
    p_alpha = np.divide(counts_alpha_helix * 100, total_counts, out=np.zeros_like(total_counts, dtype=float),
                        where=total_counts != 0)
    p_beta = np.divide(counts_beta_structures * 100, total_counts, out=np.zeros_like(total_counts, dtype=float),
                       where=total_counts != 0)
    p_other = np.divide(counts_other_structures * 100, total_counts, out=np.zeros_like(total_counts, dtype=float),
                        where=total_counts != 0)

    return p_alpha, p_beta, p_other


def plot_multi_take_evolution(folder_path, num_rows=3, num_cols=3):
    """
    Generates and saves a grid of plots showing secondary structure evolution.

    This function reads '.dat' files in a specified order, creating a grid of line plots. Each
    subplot displays the percentage of alpha-helices, beta-structures, and other structures
    as a function of time. The plots are styled for academic publications with individual axis
    labels, ticks, and a grid for each subplot, plus a shared legend at the bottom.

    Args:
        folder_path (str): The path to the directory containing the DSSP '.dat' files.
        num_rows (int, optional): The number of rows in the subplot grid. Defaults to 3.
        num_cols (int, optional): The number of columns in the subplot grid. Defaults to 3.
    """
    # Define the desired order for plotting to group proteins together
    list_of_files = [
        'AMBER_1_RbsB_dssp.dat', 'CHARMM_1_RbsB_dssp.dat', 'SIRAH_1_RbsB_dssp.dat',
        'AMBER_1_FkpA_dssp.dat', 'CHARMM_1_FkpA_dssp.dat', 'SIRAH_1_FkpA_dssp.dat',
        'AMBER_1_SpeA_dssp.dat', 'CHARMM_1_SpeA_dssp.dat', 'SIRAH_1_SpeA_dssp.dat'
    ]

    # Verify that all expected files exist in the folder
    file_names = [f for f in list_of_files if os.path.exists(os.path.join(folder_path, f))]
    if len(file_names) != len(list_of_files):
        print("Warning: Not all expected files were found in the directory.")
    if not file_names:
        print(f"Error: No specified '.dat' files were found in the directory '{folder_path}'.")
        return

    # Create a subplot grid. sharex/sharey are False for individual axes.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 10), dpi=300)
    axes = axes.flatten()  # Flatten the 2D array of axes for straightforward iteration

    # Define the colors and labels for the different structural elements
    plot_elements = {
        'alpha': {'color': 'red', 'label': r'$\alpha$-Helices'},
        'beta': {'color': 'green', 'label': r'$\beta$-Structures'},
        'other': {'color': 'yellow', 'label': 'Other Structures'}
    }

    # Iterate through each data file and its corresponding subplot axis
    for i, file_name in enumerate(file_names):
        if i >= len(axes):
            print(f"Warning: There are more files ({len(file_names)}) than available subplots ({len(axes)}). "
                  f"Ignoring extra files starting from '{file_name}'.")
            break

        ax = axes[i]
        file_path = os.path.join(folder_path, file_name)

        # Load the data and calculate percentages using the helper functions
        dssp_matrix = load_dssp_matrix(file_path)
        p_alpha, p_beta, p_other = calculate_structure_percentages(dssp_matrix)

        # Create a time axis scaled from 0.0 to 1.0 µs for the x-axis
        time_us = np.linspace(0.0, 1.0, len(p_alpha))

        # Plot the three structure types as independent lines on the current axis
        ax.plot(time_us, p_alpha, color=plot_elements['alpha']['color'], label=plot_elements['alpha']['label'])
        ax.plot(time_us, p_beta, color=plot_elements['beta']['color'], label=plot_elements['beta']['label'])
        ax.plot(time_us, p_other, color=plot_elements['other']['color'], label=plot_elements['other']['label'])

        # Add a grid to the background of each plot
        ax.grid(True, color='grey', linestyle='--', linewidth=0.6, alpha=0.7)

        # Set labels, ticks, and limits for each individual subplot
        ax.set_xlabel(r'Time ($\mu$s)', fontsize=18)
        ax.set_ylabel('Percentage (%)', fontsize=18)
        ax.set_ylim(0, 100)
        ax.set_xlim(0.0, 1.0)
        ax.set_yticks(range(0, 101, 20))
        ax.set_xticks(np.arange(0.0, 1.1, 0.2))
        ax.tick_params(axis='both', which='major', labelsize=16)

    # Deactivate any unused subplots if there are fewer files than grid cells
    for j in range(len(file_names), len(axes)):
        axes[j].set_axis_off()

    # Create a single, shared legend at the bottom of the figure for all subplots
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, 0.01), ncol=3, frameon=False, fontsize=11)

    # Adjust the layout to prevent labels from overlapping and ensure all elements fit
    plt.tight_layout(rect=[0.02, 0.05, 1, 0.96])  # rect=[left, bottom, right, top]

    # Save the figure to a file and then display it
    plt.savefig("secondary_structure_evolution_all_gridded.png", dpi=300, bbox_inches='tight')
    plt.show()


# --- Main execution block ---
if __name__ == "__main__":
    # --- Configuration ---
    # Set the path to the folder containing your DSSP .dat files.
    folder_path = "/ceph/users/akv16273/34_Elcock_13abundant_inEColi/21_Analysis_Datastructure/plots/dssp_results/"

    # --- Script Execution ---
    # Verify that the specified directory exists before attempting to plot
    if os.path.isdir(folder_path):
        # Call the main plotting function with a 3x3 grid to fit all 9 simulations.
        plot_multi_take_evolution(folder_path, num_rows=3, num_cols=3)
    else:
        # Provide a helpful error message if the directory is not found
        print(f"Error: Directory not found at '{folder_path}'")
        print("Please update the 'folder_path' variable in the script with the correct location of your data files.")