SS-train (C++)

1. Boilerplate code for headers and argument handling...

#include <string>
#include <vector>
#include "FastaSequence.h"
#include "Utilities.h"
#include "Parsers.h"
#include "Encoders.h"
 
using namespace PCP;
 
/* Create a set of features for one protein in SVMlight format.
 * The two classes are Helix(-1) and Non-Helix(+1)
 */
 
int main(int argc, const char *argv[]) {
 
  if(argc != 4) {
    Fail("Usage: SS-train <fasta> <pssm> <dssp>");
  }
  std::string fasta_filename = argv[1];
  std::string pssm_filename = argv[2];
  std::string dssp_filename = argv[3];
 

2. Read in and then parse FASTA sequence file...

  // Read in sequence, check that one was read
  std::vector<std::string> lines;
  std::vector<FastaSequence> sequences;
  int sequence_count = 0;
  ReadFile(fasta_filename, lines);
  sequence_count = ParseFastaSequences(lines, sequences);
  if(sequence_count == 0) {
    Fail("No sequence was read from file " + fasta_filename);
  }
  FastaSequence primary_sequence = sequences[0];
 

3. Read in and then parse PSSM and DSSP files...

  // PSSM: declare the outputs, empty the shared line buffer, then load and
  // parse the file. pssm holds one row of doubles per entry.
  std::vector< std::vector<double> > pssm;
  std::vector<double> information;
  lines.clear();
  ReadFile(pssm_filename, lines);
  ParseAsciiPSSM(lines, pssm, information, true);

  // DSSP: same pattern -- empty the buffer again, load the file, and parse
  // it into the DSSP sequence string and the secondary-structure string.
  std::string dssp_sequence;
  std::string true_ss;
  lines.clear();
  ReadFile(dssp_filename, lines);
  ParseDSSPOutput(lines, dssp_sequence, true_ss);
 

4. Create a sliding window to move across the sequence. Then, for each window, encode the amino acid sequence and the PSSM-related information, appending both to a feature vector, and print out the vector. Since this output will be used with SVMlight, we set IncludeFeatureNum to true and start numbering the features from '1'...

  // Generate features for a sliding window of 9 residues
  // running from the 5th residue to length-5
  std::vector<double> features;
 
  for(unsigned int center = 4; center < sequences[0].Length() - 5; ++center) {
    features.clear();
 
    // For each residue in the window, push features on end of 
    // Feature vector
    for(unsigned int index = center - 4; index <= center + 4; ++index) {
      HotEncodeAA(primary_sequence[index], features);
      features.insert(features.end(), pssm[index].begin(), pssm[index].end());
    }
 
    // Write out target
    if(true_ss[center] == 'H') {
      std::cout << "-1 ";
    } else {
      std::cout << "+1 ";
    }
 
    // Follow up by writing out features
    PrintFeatures(features, true, 1);
 
  }
 

Source listing for SS-train.cpp

You can compile and test the SS-train program yourself using the Makefile included in the root directory.