libsequence  1.9.5
ufs.cc
1 /*
2  ufs.cc
3 
4  Calculate the polarized (unfolded) site frequency spectrum from a file containing
5  aligned sequences in Fasta format.
6 
7  Author: Kevin Thornton
8 */
9 
10 #include <Sequence/Fasta.hpp>
11 #include <Sequence/Alignment.hpp>
12 #include <Sequence/PolySites.hpp>
15 #include <algorithm>
16 #include <vector>
17 #include <iostream>
18 #include <functional>
19 #include <cctype>
20 
21 //so that you know where things come from
22 using std::map;
23 using std::vector;
24 using std::cout;
25 using std::cerr;
26 using Sequence::Fasta;
28 using Sequence::makeCountList; // <Sequence/SeqUtilities.hpp>
29 using Sequence::operator+=; // <Sequence/CountingOperators.hpp> defines an operator+= for maps
30 using Sequence::Alignment::GetData;
32 
33 bool validStates(const std::map<char,unsigned> & counts)
34 /*
35  counts is assumed to be a map of nucleotides and their number of occurences.
36  The function returns false if any of the nucleotides are not in the set {A,G,C,T,N},
37  and toupper is used so that matching is case-insensitive.
38  */
39 {
40  for( map<char,unsigned>::const_iterator i = counts.begin() ;
41  i != counts.end() ;
42  ++i )
43  {
44  char ch = toupper(i->first);
45  if ( ch != 'A' && ch != 'G' && ch != 'C' && ch != 'T' && ch != 'N' )
46  return false;
47  }
48  return true;
49 }
50 
51 int main(int argc, char **argv)
52 {
53  if(argc!=3)
54  {
55  cerr << "usage: ufs file outgroup_index\n";
56  exit(1);
57  }
58 
59  const char * fastafile = argv[1];
60  const unsigned outgroup = atoi(argv[2]);
61 
62  //read in alignment and make sure that all elements are the same length, otherwise exit
63  vector<Fasta> alignment;
64  GetData(alignment,fastafile);
65  if( ! IsAlignment(alignment) )
66  {
67  cerr << fastafile << " not aligned\n";
68  exit(1);
69  }
70 
71  //make a table of variable sites, excluding gaps and sites with > 2 states in the alignment
72  PolySites SNPtable(alignment,true); //the "true" eliminates all sites in alignment with > 2 states
73 
74  vector< unsigned > ufs(alignment.size()-1,0); //store the frequency spectrum
75 
76  //here, we use iterators to polymorphic sites to access columns in the SNP table
77  //these iterators are typedefs for std::pair<double,std::string>
78  for( PolySites::const_site_iterator i = SNPtable.sbegin() ;
79  i != SNPtable.send() ; ++i )
80  {
81  const char ancstate = i->second[outgroup];
82 
83  //now, we want to count up all the nucelotide states in the ingroup. we do this in 2 stages,
84  //so that we guarantee that we skip the outgroup. as the second element in this iterator
85  //is a std::string, normal STL range operations apply.
86 
87  //the function Sequence::makeCountList takes two iterators, of type ITR, (beg,end) as arguments,
88  //and counts all the elements in the range beg,end-1, returning a
89  //std::map< std::iterator_traits<ITR>::value_type, unsigned >. In this case, the
90  //value_type is a char, since the iterators are those of std::string
91  std::map<char,unsigned> counts = makeCountList(i->second.begin(),i->second.begin()+outgroup);
92 
93  //here is a trick. std::map does not have an operator+=, but <Sequence/CountingOperators.hpp>
94  //provides template to define these operators, which is useful for counting operations like this
95  counts += makeCountList(i->second.begin()+outgroup+1,i->second.end());
96 
97  if( ! validStates(counts) )
98  {
99  cerr << "site " << i->first << " contains characters other than A,G,C,T,N\n";
100  }
101  else
102  {
103  for( map<char,unsigned>::const_iterator i = counts.begin() ;
104  i != counts.end() ;
105  ++i )
106  {
107  if ( toupper(i->first) != ancstate )
108  {
109  ufs[i->second-1]++;
110  }
111  }
112  }
113  }
114 
115  //output the unfolded site frequency spectrum.
116  for(unsigned i=0;i<ufs.size();++i)
117  {
118  cout << i+1 << '\t' << ufs[i] << '\n';
119  }
120  return 0;
121 }
Polymorphism tables for sequence data.
Definition: PolySites.hpp:33
FASTA sequence stream.
Definition: Fasta.hpp:49
bool IsAlignment(const std::vector< std::string > &data)
Declaration of namespace Sequence::Alignment.
Declaration of Sequence::makeCountList (an alternative to Sequence::stateCounter), Sequence::internalGapCheck.
Sequence::PolySites, generates polymorphism tables from data.
Declaration of Sequence::Fasta streams.
std::map< typename std::iterator_traits< Iterator >::value_type, unsigned > makeCountList(Iterator beg, Iterator end)
Declarations of operators to add associative containers together.