1 //Code for the -*- C++ -*- namespace Sequence::PolSites template members
5 Copyright (C) 2003-2009 Kevin Thornton, krthornt[]@[]uci.edu
7 Remove the brackets to email me.
9 This file is part of libsequence.
11 libsequence is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 libsequence is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 long with libsequence. If not, see <http://www.gnu.org/licenses/>.
26 // Code for the -*- C++ -*- namespace Sequence::PolySites template members
27 #include <Sequence/stateCounter.hpp>
28 #include <Sequence/SeqConstants.hpp>
29 #include <Sequence/Seq.hpp>
30 #include <Sequence/Alignment.hpp>
32 #include <type_traits>
36 It is often the case that alignment programs replace characters
37 at a site with a period (.) if that character is the same as that
38 found at that position in the first sequence in the alignment.
39 This variable is used by Sequence::PolySites when turning aligned
40 data into a polymorphism table.
42 const char IDENTICAL = '.';
47 template<typename __DataType>
48 PolySites::PolySites (const std::vector < __DataType >&alignment,
49 bool strictInfSites , bool ignoregaps,
50 bool skipMissing, bool skipAdjSNP,
54 This is the constructor if you are using "string-like" data, such as std::string, or
55 Sequence::Fasta. Note that the vector name is
56 aligment, and that means that every sequence had better be the same length!\n
58 By default, there is no limit to how many characters can "segregate" at a
59 variable position, although if there are more than 4, most biologists will
60 start to worry. There are, however, times when you may wish to onlu consider
61 sites that have a total of 2 character states. (NOTE: by two states, I mean
62 including BOTH the ingroup and the outgroup sequence.) Setting strictInfSites
63 to 1 will result in making a polymorphic sites object containing only sites with
65 \param alignment vector of data
66 \param strictInfSites if \c true, throw out all sites with > 2 states
67 \param ignoregaps if \c true, do not count gapped sites as polymorphisms
68 \param skipMissing if \c true, ignore ALL sites with missing data ('N')
69 \param skipAdjSNP if \c does nothing. a placeholder for a future feature
70 \param freqfilter Defaults to 0. For a polymorphic site to be included in the
71 final table, the minor allele count in the data (i.e. the number of times
72 the minor allele occurs at that site) must be strictly greater than
74 \note segsite positions are stored as positions (starting from 1)
75 \warning when ignoregaps=false, this class does not do the right thing
78 static_assert( std::is_same<__DataType,std::string>::value ||
79 std::is_base_of<Sequence::Seq,__DataType>::value,
80 "__DataType must be std::string or derived from Sequence::Seq");
81 if (Alignment::IsAlignment(alignment) == true)
83 fillIt<__DataType> (alignment, strictInfSites, ignoregaps,skipMissing,freqfilter);
84 if(skipAdjSNP == true)
89 template<typename __DataType>
91 PolySites::fillIt (const std::vector < __DataType >&alignment, bool strict, bool ignoregaps,
92 bool skipMissing,unsigned freqfilter)
94 static_assert( std::is_same<__DataType,std::string>::value ||
95 std::is_base_of<Sequence::Seq,__DataType>::value,
96 "__DataType must be std::string or derived from Sequence::Seq");
97 decltype(alignment.size()) numseqs = alignment.size();
99 std::vector<double> _positions;
100 decltype(alignment[0].length()) seqlen = alignment[0].length();
101 for(decltype(seqlen) j = 0 ; j < seqlen ; ++j)
104 bool haveMissing = false;
105 for(unsigned i = 0 ; i < numseqs ; ++i)
107 if (std::toupper(alignment[i][j]) == 'N'||
108 (alignment[i][j]==IDENTICAL && std::toupper(alignment[0][j]) == 'N'))
112 if (alignment[i][j] == IDENTICAL)
114 Counts(alignment[0][j]);
118 Counts(alignment[i][j]);
121 if ( (ignoregaps == true && Counts.gap == 0) || ignoregaps == false )//skip gaps if necessary
123 if (skipMissing == false || (skipMissing == true && haveMissing == false))
125 unsigned numstates = Counts.nStates();
126 unsigned minor_count = SEQMAXUNSIGNED;
127 if ( freqfilter > 0 )
129 minor_count = (Counts.a > 0 && (Counts.a < minor_count ||
130 minor_count == SEQMAXUNSIGNED))
131 ? Counts.a : minor_count;
132 minor_count = (Counts.g > 0 && (Counts.g < minor_count ||
133 minor_count == SEQMAXUNSIGNED))
134 ? Counts.g : minor_count;
135 minor_count = (Counts.c > 0 && (Counts.c < minor_count ||
136 minor_count == SEQMAXUNSIGNED))
137 ? Counts.c : minor_count;
138 minor_count = (Counts.t > 0 && (Counts.t < minor_count ||
139 minor_count == SEQMAXUNSIGNED))
140 ? Counts.t : minor_count;
141 minor_count = (Counts.zero > 0 && (Counts.zero < minor_count ||
142 minor_count == SEQMAXUNSIGNED))
143 ? Counts.zero : minor_count;
144 minor_count = (Counts.one > 0 && (Counts.one < minor_count ||
145 minor_count == SEQMAXUNSIGNED))
146 ? Counts.one : minor_count;
148 else if (freqfilter == 0 )
151 if(strict == false && numstates > 1 && minor_count > freqfilter)
154 _positions.push_back(double(j+1));//add 1 to emulate a real positions
156 else if(strict == true && numstates == 2 && minor_count > freqfilter )
157 //if there are 2 and only 2 character states
159 _positions.push_back(double(j+1));//add 1 to emulate a real positions
164 std::vector<std::string> _data(numseqs);
165 for(unsigned i = 0 ; i < numseqs ; ++i)
167 for(unsigned j = 0 ; j < _positions.size() ; ++j)
169 if (alignment[i][unsigned(_positions[j])-1] == IDENTICAL)
171 _data[i] += alignment[0][unsigned(_positions[j])-1];
175 _data[i] += alignment[i][unsigned(_positions[j])-1];
179 PolyTable::assign(std::move(_positions),std::move(_data));