SS-train (Python)

Generate a training feature file for SVMlight. This script takes the FASTA file, a PSSM and an anchored MSA generated for the sequence, as well as the true secondary structure extracted with DSSP. Helical residues are labeled "1"; coil and beta-sheet residues are labeled "-1".

import sys
import PCPML
from PCPML_Utilities import ReadFile
 
# Bail out with a usage line unless exactly four arguments follow the
# script name.
if len(sys.argv) != 5:
    usage_parts = ("Usage:", sys.argv[0], "<fasta> <pssm> <anchored_msa> <dssp>")
    sys.stdout.write(" ".join(usage_parts) + "\n")
    sys.exit(-1)

# Positional arguments, in order: FASTA, PSSM, anchored MSA, DSSP output.
fasta_filename, pssm_filename, msa_filename, dssp_filename = sys.argv[1:5]
 
# Shared line buffer: cleared and passed to ReadFile before each input file.
lines = PCPML.Lines()

# FASTA: the first line is kept as the header; every following line is
# concatenated to form the sequence string.
ReadFile(fasta_filename, lines)
header = lines[0]
sequence = ""
for i in range(1, lines.size()):
    sequence += lines[i]

# PSSM: ParseAsciiPSSM receives the output containers `pssm` and `inf`.
# NOTE(review): meaning of the trailing argument 1 is not visible here.
pssm = PCPML.Matrix()
inf = PCPML.VectDouble()
lines.clear()
ReadFile(pssm_filename, lines)
PCPML.ParseAsciiPSSM(lines, pssm, inf, 1)

# Anchored MSA: ParseAnchoredMSA receives the output container `msa`.
msa = PCPML.Matrix()
lines.clear()
ReadFile(msa_filename, lines)
PCPML.ParseAnchoredMSA(lines, msa)

# DSSP: the same buffered lines are parsed twice, once for the secondary
# structure (PCPML.SS) and once for the sequence (PCPML.SEQUENCE).
lines.clear()
ReadFile(dssp_filename, lines)
True_SS = PCPML.ParseDSSPOutput(lines, PCPML.SS)
DSSP_Seq = PCPML.ParseDSSPOutput(lines, PCPML.SEQUENCE)
 
# Emit one labelled feature row per window centre. Each row concatenates,
# for the nine residues center-4 .. center+4: the encoded amino acid, that
# residue's PSSM row, its MSA row, and its `inf` value.
f = PCPML.VectDouble()
# NOTE(review): with a +/-4 window, len(sequence)-4 would also keep every
# index in range; the original bound of -6 is preserved -- confirm intent.
for center in range(4, len(sequence) - 6):
    f.clear()

    for idx in range(center - 4, center + 5):
        PCPML.HotEncodeAA(sequence[idx], f)
        # The column count must come from this residue's own PSSM row (idx),
        # not from an unrelated index left over from another loop.
        for col in range(0, pssm[idx].size()):
            f.push_back(pssm[idx][col])
        for col in range(0, msa[idx].size()):
            f.push_back(msa[idx][col])
        f.push_back(inf[idx])

    # Write the label first: "1" when the centre residue is 'H' in the
    # DSSP assignment, "-1" otherwise.
    if True_SS[center] == 'H':
        sys.stdout.write("1 ")
    else:
        sys.stdout.write("-1 ")
    # Then the feature vector itself.
    # NOTE(review): meaning of the two trailing 1 arguments is not visible here.
    PCPML.PrintFeatures(f, 1, 1)