1 // Code for the -*- C++ -*- namespace Sequence::Alignment
5 Copyright (C) 2003-2009 Kevin Thornton, krthornt[]@[]uci.edu
7 Remove the brackets to email me.
9 This file is part of libsequence.
11 libsequence is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 libsequence is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 long with libsequence. If not, see <http://www.gnu.org/licenses/>.
26 /*! \file Alignment.tcc
27 The static assertion that is used throughout this file
28 ensures that the template type parameter T is either
29 std::pair<std::string,std::string> or derived from that type,
30 which includes Sequence::Seq (see Sequence/Seq.hpp).
31 @brief implementation of routines declared in Alignment.hpp
33 #include <Sequence/Alignment.hpp>
34 #include <Sequence/SeqConstants.hpp>
35 #include <Sequence/SeqAlphabets.hpp>
36 #include <type_traits>
46 GetData(std::vector<T> &seqarray, const char *infilename)
48 Read objects of type T and put them into the vector seqarray.
49 Note that seqarray is not const, so that's where the data go.
50 infilename refers to a file from which to read the data. Any
51 exceptions that can be thrown when reading the data are not
57 #include <Sequence/Fasta.hpp>
58 #include <Sequence/Alignment.hpp>
59 //using namespace std;
60 using namespace Sequence;
64 const char *infile = foo;
66 Alignment::GetData(&data,foo);
67 } catch (std::runtime_error &e) {
68 cerr << "whoa--exception caught"<<endl;
74 \param seqarray a vector<T> that you want filled
75 \param infilename name of file from which fo fill seqarray
76 \note if \a infilename is NULL, the function returns, having done nothing
79 if (infilename == nullptr)
82 std::ifstream infile(infilename);
86 while (!(infile.eof()))
88 infile >> temp >> std::ws;
89 seqarray.push_back(temp);
96 GetData(std::vector<T> &seqarray, std::istream &input_stream)
98 Read objects of type T and put them into the vector seqarray.
99 Note that seqarray is not const, so that's where the data go.
100 This function is similar to GetData(vector<T> &seqarray,
101 const char *infilename), except that it is passed a reference
102 to an open input stream, such as a file stream, cin, etc.
103 \param seqarray an empty vector<T> that you want filled
104 \param input_stream name of istream from which fo fill seqarray
110 while (!(input_stream.eof()))
112 input_stream >> temp >> std::ws;
113 seqarray.push_back(temp);
119 template <typename T>
121 ReadNObjects(std::vector<T> &seqarray, const unsigned &n,
122 std::istream &input_stream)
124 Read a fixed number n of objects of type T and put
125 them into the vector seqarray.
126 Note that seqarray is not const, so that's where the data go.
127 This function is similar to GetData(vector<T> &seqarray,
128 const char *infilename), except that it is passed a reference
129 to an open input stream, such as a file stream, cin, etc.
130 \param seqarray an empty vector<T> that you want filled
131 \param n number of objects of type T to read
132 \param input_stream name of istream from which fo fill seqarray
133 \note The stream is not closed after each read of n records, and
134 seqarray is appended to, and thus grows each time.
137 for (unsigned i = 0; !input_stream.eof() && i < n; ++i)
139 if (!input_stream || input_stream.eof())
142 input_stream >> temp >> std::ws;
143 seqarray.push_back(temp);
148 template <typename T>
150 Gapped(const std::vector<T> &data)
152 \param data vector<T> containing sequence data
153 \return true if the vector contains a gap character ('-')
157 for (typename std::vector<T>::size_type i = 0; i < data.size();
159 //iterate over sequences
161 if (data[i].seq.find('-') != std::string::npos)
167 template <typename T>
169 IsAlignment(const std::vector<T> &data)
171 A vector of sequences/strings is only an alignment if all
172 strings are the same length.
173 \param data vector<T> to check
176 for (typename std::vector<T>::size_type i = 0; i < data.size();
179 if (data[i].seq.length() != data[0].seq.length())
188 template <typename Iterator>
190 validForPolyAnalysis(Iterator beg, Iterator end)
192 \return true if each element in the range [beg,end) only contains
193 characters in the set {A,G,C,T,N,-}, false otherwise
198 if (std::find_if(beg->seq.begin(), beg->seq.end(),
199 Sequence::invalidPolyChar())
209 template <typename T>
211 UnGappedLength(const std::vector<T> &data)
214 Returns the number of sites in the alignment for which
215 all objects do not contain the gap character '-'. If the data are not
216 aligned, the value Sequence::SEQMAXUNSIGNED is returned as an error.
217 \param data vector<T> to check
221 if (!IsAlignment(data))
222 return Sequence::SEQMAXUNSIGNED;
224 for (size_t j = 0; j < data[0].seq.length(); ++j)
226 bool site_gapped = 0;
227 for (size_t i = 0; i < data.size(); ++i)
229 if (data[i][j] == '-')
241 template <typename T>
243 RemoveGaps(std::vector<T> &data)
245 Modifies the data vector to remove all positions that
246 contain the gap character'-'.
247 \param data vector<T> to modify
250 size_t i, j, datasize = data.size();
251 size_t length = data[0].seq.length();
252 std::vector<std::string> ungapped_sequences(data.size());
255 for (i = 0; i < length; ++i)
256 { //iterate over sites
257 for (j = 0, site_is_gapped = 0; j < datasize; ++j)
259 if (data[j].seq[i] == '-')
265 if (!(site_is_gapped))
267 for (j = 0; j != data.size(); ++j)
268 ungapped_sequences[j] += data[j].seq[i];
273 for (j = 0; j != datasize; ++j)
275 data[j].seq = std::move(ungapped_sequences[j]);
279 //only remove gaps from the beginning
280 //and end of an alignment
281 template <typename T>
283 RemoveTerminalGaps(std::vector<T> &data)
285 Remove all gapped sites from the ends of the alignment,
286 up until the name site on either side that is ungapped.
287 \param data vector<T> to modify
292 = data[0].seq.length(); //how much we have to iterate over
293 std::vector<std::string> trimmed_sequences;
294 size_t leftmost, rightmost, numUngapped;
296 size_t size = data.size();
298 leftmost = SEQMAXUNSIGNED;
299 rightmost = length + 1; //offset by one b/c its an array...
301 //find the leftmost site where all sites in the alignment are ungapped
302 for (i = 0; i < length; ++i)
303 { //iterate over sites
304 for (numUngapped = 0, j = 0; j != data.size(); ++j)
306 if (data[j].seq[i] != '-')
309 if (numUngapped == size)
316 //find the rigthmost site where all sites in the alignment are ungapped
317 bool exit_condition = false;
319 i < data[0].seq.length() && exit_condition == false; --i)
321 for (numUngapped = 0, j = 0; j != data.size(); ++j)
323 if (data[j].seq[i] != '-')
326 if (numUngapped == size)
329 exit_condition = true;
333 //now, fill the array of trimmed sequences
334 offset = rightmost - leftmost + 1;
335 for (j = 0; j != data.size(); ++j)
336 trimmed_sequences.push_back(
337 data[j].seq.substr(leftmost, offset));
339 //now, redo the seq array for the current object
340 for (j = 0; j != data.size(); ++j)
342 //Probably can't hurt...
343 data[j] = std::move(T(data[j].name, trimmed_sequences[j]));
347 template <typename T>
349 RemoveFixedOutgroupInsertions(std::vector<T> &data,
352 Removes all positions from data that for which the outgroup
353 contains an insertion relative to ingroup
354 @param data a vector of Seq objects
355 @param site index of the site at which to begin (set to 0 usually)
356 @param ref the index of the outgroup in @a data
359 size_t nsam = data.size() - 1;
360 size_t s = data[0].seq.length() - 1, max = s + 1;
361 for (size_t i = 0; i < max; --s, ++i)
364 for (size_t ind = 0; ind < data.size(); ++ind)
366 if (ind != ref && data[ind].seq[s] == '-')
373 for (unsigned ind = 0; ind < data.size(); ++ind)
375 data[ind].seq.erase(s, 1);
381 template <typename T>
383 Trim(const std::vector<T> &data, const std::vector<int> &sites)
386 Returns a copy of the data vector, modified in the following way. The sites vector
387 contains an even number of sites (whose values are sorted). If sites
388 does not contain an even number of values std::runtime_error is thrown.
389 If sites is empty, std::runtime_error is thrown. The values in sites
390 represent a series of intervals that you wish to keep, and the return vector is
391 consists only of those--i.e. all positions not present
392 in the intervals defined in sites are lost. For example, if you pass a
393 vector<int> containing the values 0,10,21, and 30, then the data vector is modified
394 so that positions 0 through 10 and 21 through 30 are all that remains.
395 One intended use of this function is to pull, for example, the coding region
396 out of an aligned block.
397 \param data the original data
398 \param sites vector<int> containing an even number of integers
399 specifying the intervals of data to keep
400 \exception std::runtime_error
403 size_t i, j, numseqs = data.size(), numIntervals = sites.size();
404 std::vector<T> trimmedData(numseqs);
405 std::vector<std::string> trimmedTemp(numseqs);
408 throw std::runtime_error("Sequence::Alignment::Trim(): "
409 "empty vector of positions "
410 "passed to function");
412 if (numIntervals % 2 != 0)
414 throw std::runtime_error("Sequence::Alignment::Trim(): "
415 "odd number of positions passed");
418 for (i = 0; i < numIntervals; i += 2)
420 unsigned start = unsigned(sites[i]);
421 unsigned stop = unsigned(sites[i + 1]);
422 for (j = 0; j < numseqs; ++j)
425 += data[j].seq.substr(start, stop - start + 1);
429 for (i = 0; i < numseqs; ++i)
431 trimmedData[i].name = data[i].name;
432 trimmedData[i].seq = std::move(trimmedTemp[i]);
438 template <typename T>
440 TrimComplement(const std::vector<T> &data,
441 const std::vector<int> &sites)
444 Returns a copy the data vector, modified in the following way. The sites vector
445 contains an even number of sites (whose values are sorted). If sites
446 does not contain an even number of values std::runtime_error is thrown.
447 If sites is empty, std::runtime_error is thrown. The values in sites
448 represent a series of intervals that you wish to keep, and the return vector
449 consists only of sites not present in \a sites--i.e. all positions not present
450 in the intervals defined in sites are kept. For example, if you pass a
451 vector<int> containing the values 0,10,21, and 30, then the data vector is modified
452 so that positions 11 through 20 and 31 through the end of the sequences
453 are all that remains.
454 \param data the original data
455 \param sites vector<int> containing an even number of integers
456 specifying the intervals of data to throw away
457 \exception std::runtime_error
460 std::vector<int> newSites;
461 size_t i, j, numseqs = data.size(), numIntervals = sites.size(),
466 throw std::runtime_error(
467 "Sequence::Alignment::TrimComplement(): empty vector "
468 "of positions passed to function");
470 if (numIntervals % 2 != 0)
472 throw std::runtime_error("Sequence::Alignment::"
473 "TrimComplement(): odd numer of "
474 "positions passed to function");
477 std::vector<T> trimmedData(numseqs);
478 std::vector<std::string> trimmedTemp(numseqs);
483 for (i = 1; i < numIntervals; ++i)
486 if (odd_even % 2 == 0)
488 newSites.push_back(sites[i] + 1);
490 else if (odd_even % 2 != 0)
492 newSites.push_back(sites[i] - 1);
496 else if (sites[0] > 0)
498 newSites.push_back(0);
499 for (i = 0; i < numIntervals; ++i)
502 if (odd_even % 2 == 0)
504 newSites.push_back(sites[i] + 1);
506 else if (odd_even % 2 != 0)
508 newSites.push_back(sites[i] - 1);
513 lastval = size_t(newSites[newSites.size() - 1]);
515 numIntervals = newSites.size();
516 for (i = 0; i < numIntervals; i += 2)
518 auto start = size_t(newSites[i]);
519 auto stop = size_t(newSites[i + 1]);
520 for (j = 0; j < numseqs; ++j)
523 += data[j].seq.substr(start, stop - start + 1);
527 for (j = 0; j < numseqs; ++j)
529 trimmedTemp[j] += data[j].seq.substr(lastval);
532 for (i = 0; i < numseqs; ++i)
534 trimmedData[i].name = data[i].name;
535 trimmedData[i].seq = std::move(trimmedTemp[i]);