libsequence  1.9.5
PolySites.tcc
1 //Code for the -*- C++ -*- namespace Sequence::PolSites template members
2 
3 /*
4 
5 Copyright (C) 2003-2009 Kevin Thornton, krthornt[]@[]uci.edu
6 
7 Remove the brackets to email me.
8 
9 This file is part of libsequence.
10 
11 libsequence is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
15 
16 libsequence is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License
22 long with libsequence. If not, see <http://www.gnu.org/licenses/>.
23 
24 */
25 
26 // Code for the -*- C++ -*- namespace Sequence::PolySites template members
27 #include <Sequence/stateCounter.hpp>
28 #include <Sequence/SeqConstants.hpp>
29 #include <Sequence/Seq.hpp>
30 #include <Sequence/Alignment.hpp>
31 #include <cctype>
32 #include <type_traits>
33 namespace
34 {
35  /*!
36  It is often the case that alignment programs replace characters
37  at a site with a period (.) if that character is the same as that
38  found at that position in the first sequence in the alignment.
39  This variable is used by Sequence::PolySites when turning aligned
40  data into a polymorphism table.
41  */
42  const char IDENTICAL = '.';
43 }//anon namespace
44 
45 namespace Sequence
46 {
47  template<typename __DataType>
48  PolySites::PolySites (const std::vector < __DataType >&alignment,
49  bool strictInfSites , bool ignoregaps,
50  bool skipMissing, bool skipAdjSNP,
51  unsigned freqfilter):
52  PolyTable()
53  /*!
54  This is the constructor if you are using "string-like" data, such as std::string, or
55  Sequence::Fasta. Note that the vector name is
56  aligment, and that means that every sequence had better be the same length!\n
57  \n
58  By default, there is no limit to how many characters can "segregate" at a
59  variable position, although if there are more than 4, most biologists will
60  start to worry. There are, however, times when you may wish to onlu consider
61  sites that have a total of 2 character states. (NOTE: by two states, I mean
62  including BOTH the ingroup and the outgroup sequence.) Setting strictInfSites
63  to 1 will result in making a polymorphic sites object containing only sites with
64  2 states.\n
65  \param alignment vector of data
66  \param strictInfSites if \c true, throw out all sites with > 2 states
67  \param ignoregaps if \c true, do not count gapped sites as polymorphisms
68  \param skipMissing if \c true, ignore ALL sites with missing data ('N')
69  \param skipAdjSNP if \c does nothing. a placeholder for a future feature
70  \param freqfilter Defaults to 0. For a polymorphic site to be included in the
71  final table, the minor allele count in the data (i.e. the number of times
72  the minor allele occurs at that site) must be strictly greater than
73  \c freqfilter
74  \note segsite positions are stored as positions (starting from 1)
75  \warning when ignoregaps=false, this class does not do the right thing
76  */
77  {
78  static_assert( std::is_same<__DataType,std::string>::value ||
79  std::is_base_of<Sequence::Seq,__DataType>::value,
80  "__DataType must be std::string or derived from Sequence::Seq");
81  if (Alignment::IsAlignment(alignment) == true)
82  {
83  fillIt<__DataType> (alignment, strictInfSites, ignoregaps,skipMissing,freqfilter);
84  if(skipAdjSNP == true)
85  {}
86  }
87  }
88 
89  template<typename __DataType>
90  void
91  PolySites::fillIt (const std::vector < __DataType >&alignment, bool strict, bool ignoregaps,
92  bool skipMissing,unsigned freqfilter)
93  {
94  static_assert( std::is_same<__DataType,std::string>::value ||
95  std::is_base_of<Sequence::Seq,__DataType>::value,
96  "__DataType must be std::string or derived from Sequence::Seq");
97  decltype(alignment.size()) numseqs = alignment.size();
98  if(!numseqs) return;
99  std::vector<double> _positions;
100  decltype(alignment[0].length()) seqlen = alignment[0].length();
101  for(decltype(seqlen) j = 0 ; j < seqlen ; ++j)
102  {
103  stateCounter Counts;
104  bool haveMissing = false;
105  for(unsigned i = 0 ; i < numseqs ; ++i)
106  {
107  if (std::toupper(alignment[i][j]) == 'N'||
108  (alignment[i][j]==IDENTICAL && std::toupper(alignment[0][j]) == 'N'))
109  {
110  haveMissing = true;
111  }
112  if (alignment[i][j] == IDENTICAL)
113  {
114  Counts(alignment[0][j]);
115  }
116  else
117  {
118  Counts(alignment[i][j]);
119  }
120  }
121  if ( (ignoregaps == true && Counts.gap == 0) || ignoregaps == false )//skip gaps if necessary
122  {
123  if (skipMissing == false || (skipMissing == true && haveMissing == false))
124  {
125  unsigned numstates = Counts.nStates();
126  unsigned minor_count = SEQMAXUNSIGNED;
127  if ( freqfilter > 0 )
128  {
129  minor_count = (Counts.a > 0 && (Counts.a < minor_count ||
130  minor_count == SEQMAXUNSIGNED))
131  ? Counts.a : minor_count;
132  minor_count = (Counts.g > 0 && (Counts.g < minor_count ||
133  minor_count == SEQMAXUNSIGNED))
134  ? Counts.g : minor_count;
135  minor_count = (Counts.c > 0 && (Counts.c < minor_count ||
136  minor_count == SEQMAXUNSIGNED))
137  ? Counts.c : minor_count;
138  minor_count = (Counts.t > 0 && (Counts.t < minor_count ||
139  minor_count == SEQMAXUNSIGNED))
140  ? Counts.t : minor_count;
141  minor_count = (Counts.zero > 0 && (Counts.zero < minor_count ||
142  minor_count == SEQMAXUNSIGNED))
143  ? Counts.zero : minor_count;
144  minor_count = (Counts.one > 0 && (Counts.one < minor_count ||
145  minor_count == SEQMAXUNSIGNED))
146  ? Counts.one : minor_count;
147  }
148  else if (freqfilter == 0 )
149  minor_count = 1;
150 
151  if(strict == false && numstates > 1 && minor_count > freqfilter)
152  //site is variable
153  {
154  _positions.push_back(double(j+1));//add 1 to emulate a real positions
155  }
156  else if(strict == true && numstates == 2 && minor_count > freqfilter )
157  //if there are 2 and only 2 character states
158  {
159  _positions.push_back(double(j+1));//add 1 to emulate a real positions
160  }
161  }
162  }
163  }
164  std::vector<std::string> _data(numseqs);
165  for(unsigned i = 0 ; i < numseqs ; ++i)
166  {
167  for(unsigned j = 0 ; j < _positions.size() ; ++j)
168  {
169  if (alignment[i][unsigned(_positions[j])-1] == IDENTICAL)
170  {
171  _data[i] += alignment[0][unsigned(_positions[j])-1];
172  }
173  else
174  {
175  _data[i] += alignment[i][unsigned(_positions[j])-1];
176  }
177  }
178  }
179  PolyTable::assign(std::move(_positions),std::move(_data));
180  }
181 }