libsequence  1.9.5
Alignment.tcc
1 // Code for the -*- C++ -*- namespace Sequence::Alignment
2 
3 /*
4 
5 Copyright (C) 2003-2009 Kevin Thornton, krthornt[]@[]uci.edu
6 
7 Remove the brackets to email me.
8 
9 This file is part of libsequence.
10 
11 libsequence is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
15 
16 libsequence is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License
22 long with libsequence. If not, see <http://www.gnu.org/licenses/>.
23 
24 */
25 
26 /*! \file Alignment.tcc
27  The static assertion that is used throughout this file
28  ensures that the template type parameter T is either
29  std::pair<std::string,std::string> or derived from that type,
30  which includes Sequence::Seq (see Sequence/Seq.hpp).
31  @brief implementation of routines declared in Alignment.hpp
32 */
33 #include <Sequence/Alignment.hpp>
34 #include <Sequence/SeqConstants.hpp>
35 #include <Sequence/SeqAlphabets.hpp>
36 #include <type_traits>
37 #include <iterator>
38 #include <algorithm>
39 #include <iostream>
40 namespace Sequence
41 {
42  namespace Alignment
43  {
44  template <typename T>
45  void
46  GetData(std::vector<T> &seqarray, const char *infilename)
47  /*!
48  Read objects of type T and put them into the vector seqarray.
49  Note that seqarray is not const, so that's where the data go.
50  infilename refers to a file from which to read the data. Any
51  exceptions that can be thrown when reading the data are not
52  caught here.
53  \n
54  Example:\n
55  \code
56  #include <iostream>
57  #include <Sequence/Fasta.hpp>
58  #include <Sequence/Alignment.hpp>
59  //using namespace std;
60  using namespace Sequence;
61  int main
62  {
63  vector<Fasta> data;
64  const char *infile = foo;
65  try {
66  Alignment::GetData(&data,foo);
67  } catch (std::runtime_error &e) {
68  cerr << "whoa--exception caught"<<endl;
69  e.print(cerr);
70  cerr << endl;
71  }
72  }
73  \endcode
74  \param seqarray a vector<T> that you want filled
75  \param infilename name of file from which fo fill seqarray
76  \note if \a infilename is NULL, the function returns, having done nothing
77  */
78  {
79  if (infilename == nullptr)
80  return;
81 
82  std::ifstream infile(infilename);
83  if (infile)
84  {
85  T temp;
86  while (!(infile.eof()))
87  {
88  infile >> temp >> std::ws;
89  seqarray.push_back(temp);
90  }
91  }
92  }
93 
94  template <typename T>
95  std::istream &
96  GetData(std::vector<T> &seqarray, std::istream &input_stream)
97  /*!
98  Read objects of type T and put them into the vector seqarray.
99  Note that seqarray is not const, so that's where the data go.
100  This function is similar to GetData(vector<T> &seqarray,
101  const char *infilename), except that it is passed a reference
102  to an open input stream, such as a file stream, cin, etc.
103  \param seqarray an empty vector<T> that you want filled
104  \param input_stream name of istream from which fo fill seqarray
105  */
106  {
107  T temp;
108  if (input_stream)
109  {
110  while (!(input_stream.eof()))
111  {
112  input_stream >> temp >> std::ws;
113  seqarray.push_back(temp);
114  }
115  }
116  return input_stream;
117  }
118 
119  template <typename T>
120  std::istream &
121  ReadNObjects(std::vector<T> &seqarray, const unsigned &n,
122  std::istream &input_stream)
123  /*!
124  Read a fixed number n of objects of type T and put
125  them into the vector seqarray.
126  Note that seqarray is not const, so that's where the data go.
127  This function is similar to GetData(vector<T> &seqarray,
128  const char *infilename), except that it is passed a reference
129  to an open input stream, such as a file stream, cin, etc.
130  \param seqarray an empty vector<T> that you want filled
131  \param n number of objects of type T to read
132  \param input_stream name of istream from which fo fill seqarray
133  \note The stream is not closed after each read of n records, and
134  seqarray is appended to, and thus grows each time.
135  */
136  {
137  for (unsigned i = 0; !input_stream.eof() && i < n; ++i)
138  {
139  if (!input_stream || input_stream.eof())
140  return input_stream;
141  T temp;
142  input_stream >> temp >> std::ws;
143  seqarray.push_back(temp);
144  }
145  return input_stream;
146  }
147 
148  template <typename T>
149  bool
150  Gapped(const std::vector<T> &data)
151  /*!
152  \param data vector<T> containing sequence data
153  \return true if the vector contains a gap character ('-')
154  , false otherwise.
155  */
156  {
157  for (typename std::vector<T>::size_type i = 0; i < data.size();
158  ++i)
159  //iterate over sequences
160  {
161  if (data[i].seq.find('-') != std::string::npos)
162  return true;
163  }
164  return false;
165  }
166 
167  template <typename T>
168  bool
169  IsAlignment(const std::vector<T> &data)
170  /*!
171  A vector of sequences/strings is only an alignment if all
172  strings are the same length.
173  \param data vector<T> to check
174  */
175  {
176  for (typename std::vector<T>::size_type i = 0; i < data.size();
177  ++i)
178  {
179  if (data[i].seq.length() != data[0].seq.length())
180  {
181  return 0;
182  }
183  }
184 
185  return 1;
186  }
187 
188  template <typename Iterator>
189  bool
190  validForPolyAnalysis(Iterator beg, Iterator end)
191  /*!
192  \return true if each element in the range [beg,end) only contains
193  characters in the set {A,G,C,T,N,-}, false otherwise
194  */
195  {
196  while (beg < end)
197  {
198  if (std::find_if(beg->seq.begin(), beg->seq.end(),
199  Sequence::invalidPolyChar())
200  != beg->seq.end())
201  {
202  return false;
203  }
204  ++beg;
205  }
206  return true;
207  }
208 
209  template <typename T>
210  unsigned
211  UnGappedLength(const std::vector<T> &data)
212 
213  /*!
214  Returns the number of sites in the alignment for which
215  all objects do not contain the gap character '-'. If the data are not
216  aligned, the value Sequence::SEQMAXUNSIGNED is returned as an error.
217  \param data vector<T> to check
218  */
219  {
220  unsigned len = 0;
221  if (!IsAlignment(data))
222  return Sequence::SEQMAXUNSIGNED;
223 
224  for (size_t j = 0; j < data[0].seq.length(); ++j)
225  {
226  bool site_gapped = 0;
227  for (size_t i = 0; i < data.size(); ++i)
228  {
229  if (data[i][j] == '-')
230  {
231  site_gapped = 1;
232  i = data.size();
233  }
234  }
235  if (!(site_gapped))
236  ++len;
237  }
238  return len;
239  }
240 
241  template <typename T>
242  void
243  RemoveGaps(std::vector<T> &data)
244  /*!
245  Modifies the data vector to remove all positions that
246  contain the gap character'-'.
247  \param data vector<T> to modify
248  */
249  {
250  size_t i, j, datasize = data.size();
251  size_t length = data[0].seq.length();
252  std::vector<std::string> ungapped_sequences(data.size());
253  bool site_is_gapped;
254 
255  for (i = 0; i < length; ++i)
256  { //iterate over sites
257  for (j = 0, site_is_gapped = 0; j < datasize; ++j)
258  {
259  if (data[j].seq[i] == '-')
260  {
261  site_is_gapped = 1;
262  j = datasize;
263  }
264  }
265  if (!(site_is_gapped))
266  {
267  for (j = 0; j != data.size(); ++j)
268  ungapped_sequences[j] += data[j].seq[i];
269  }
270  }
271 
272  //redo the data
273  for (j = 0; j != datasize; ++j)
274  {
275  data[j].seq = std::move(ungapped_sequences[j]);
276  }
277  }
278 
279  //only remove gaps from the beginning
280  //and end of an alignment
281  template <typename T>
282  void
283  RemoveTerminalGaps(std::vector<T> &data)
284  /*!
285  Remove all gapped sites from the ends of the alignment,
286  up until the name site on either side that is ungapped.
287  \param data vector<T> to modify
288  */
289  {
290  size_t i, j,
291  length
292  = data[0].seq.length(); //how much we have to iterate over
293  std::vector<std::string> trimmed_sequences;
294  size_t leftmost, rightmost, numUngapped;
295  size_t offset;
296  size_t size = data.size();
297 
298  leftmost = SEQMAXUNSIGNED;
299  rightmost = length + 1; //offset by one b/c its an array...
300 
301  //find the leftmost site where all sites in the alignment are ungapped
302  for (i = 0; i < length; ++i)
303  { //iterate over sites
304  for (numUngapped = 0, j = 0; j != data.size(); ++j)
305  {
306  if (data[j].seq[i] != '-')
307  ++numUngapped;
308  }
309  if (numUngapped == size)
310  {
311  leftmost = i;
312  i = length + 1;
313  }
314  }
315 
316  //find the rigthmost site where all sites in the alignment are ungapped
317  bool exit_condition = false;
318  for (i = length - 1;
319  i < data[0].seq.length() && exit_condition == false; --i)
320  {
321  for (numUngapped = 0, j = 0; j != data.size(); ++j)
322  {
323  if (data[j].seq[i] != '-')
324  ++numUngapped;
325  }
326  if (numUngapped == size)
327  {
328  rightmost = i;
329  exit_condition = true;
330  }
331  }
332 
333  //now, fill the array of trimmed sequences
334  offset = rightmost - leftmost + 1;
335  for (j = 0; j != data.size(); ++j)
336  trimmed_sequences.push_back(
337  data[j].seq.substr(leftmost, offset));
338 
339  //now, redo the seq array for the current object
340  for (j = 0; j != data.size(); ++j)
341  {
342  //Probably can't hurt...
343  data[j] = std::move(T(data[j].name, trimmed_sequences[j]));
344  }
345  }
346 
347  template <typename T>
348  void
349  RemoveFixedOutgroupInsertions(std::vector<T> &data,
350  const unsigned &ref)
351  /*!
352  Removes all positions from data that for which the outgroup
353  contains an insertion relative to ingroup
354  @param data a vector of Seq objects
355  @param site index of the site at which to begin (set to 0 usually)
356  @param ref the index of the outgroup in @a data
357  */
358  {
359  size_t nsam = data.size() - 1;
360  size_t s = data[0].seq.length() - 1, max = s + 1;
361  for (size_t i = 0; i < max; --s, ++i)
362  {
363  unsigned ngap = 0;
364  for (size_t ind = 0; ind < data.size(); ++ind)
365  {
366  if (ind != ref && data[ind].seq[s] == '-')
367  {
368  ngap++;
369  }
370  }
371  if (ngap == nsam)
372  {
373  for (unsigned ind = 0; ind < data.size(); ++ind)
374  {
375  data[ind].seq.erase(s, 1);
376  }
377  }
378  }
379  }
380 
381  template <typename T>
382  std::vector<T>
383  Trim(const std::vector<T> &data, const std::vector<int> &sites)
384 
385  /*!
386  Returns a copy of the data vector, modified in the following way. The sites vector
387  contains an even number of sites (whose values are sorted). If sites
388  does not contain an even number of values std::runtime_error is thrown.
389  If sites is empty, std::runtime_error is thrown. The values in sites
390  represent a series of intervals that you wish to keep, and the return vector is
391  consists only of those--i.e. all positions not present
392  in the intervals defined in sites are lost. For example, if you pass a
393  vector<int> containing the values 0,10,21, and 30, then the data vector is modified
394  so that positions 0 through 10 and 21 through 30 are all that remains.
395  One intended use of this function is to pull, for example, the coding region
396  out of an aligned block.
397  \param data the original data
398  \param sites vector<int> containing an even number of integers
399  specifying the intervals of data to keep
400  \exception std::runtime_error
401  */
402  {
403  size_t i, j, numseqs = data.size(), numIntervals = sites.size();
404  std::vector<T> trimmedData(numseqs);
405  std::vector<std::string> trimmedTemp(numseqs);
406  if (sites.empty())
407  {
408  throw std::runtime_error("Sequence::Alignment::Trim(): "
409  "empty vector of positions "
410  "passed to function");
411  }
412  if (numIntervals % 2 != 0)
413  {
414  throw std::runtime_error("Sequence::Alignment::Trim(): "
415  "odd number of positions passed");
416  }
417 
418  for (i = 0; i < numIntervals; i += 2)
419  {
420  unsigned start = unsigned(sites[i]);
421  unsigned stop = unsigned(sites[i + 1]);
422  for (j = 0; j < numseqs; ++j)
423  {
424  trimmedTemp[j]
425  += data[j].seq.substr(start, stop - start + 1);
426  }
427  }
428 
429  for (i = 0; i < numseqs; ++i)
430  {
431  trimmedData[i].name = data[i].name;
432  trimmedData[i].seq = std::move(trimmedTemp[i]);
433  }
434 
435  return trimmedData;
436  }
437 
438  template <typename T>
439  std::vector<T>
440  TrimComplement(const std::vector<T> &data,
441  const std::vector<int> &sites)
442 
443  /*!
444  Returns a copy the data vector, modified in the following way. The sites vector
445  contains an even number of sites (whose values are sorted). If sites
446  does not contain an even number of values std::runtime_error is thrown.
447  If sites is empty, std::runtime_error is thrown. The values in sites
448  represent a series of intervals that you wish to keep, and the return vector
449  consists only of sites not present in \a sites--i.e. all positions not present
450  in the intervals defined in sites are kept. For example, if you pass a
451  vector<int> containing the values 0,10,21, and 30, then the data vector is modified
452  so that positions 11 through 20 and 31 through the end of the sequences
453  are all that remains.
454  \param data the original data
455  \param sites vector<int> containing an even number of integers
456  specifying the intervals of data to throw away
457  \exception std::runtime_error
458  */
459  {
460  std::vector<int> newSites;
461  size_t i, j, numseqs = data.size(), numIntervals = sites.size(),
462  lastval;
463 
464  if (sites.empty())
465  {
466  throw std::runtime_error(
467  "Sequence::Alignment::TrimComplement(): empty vector "
468  "of positions passed to function");
469  }
470  if (numIntervals % 2 != 0)
471  {
472  throw std::runtime_error("Sequence::Alignment::"
473  "TrimComplement(): odd numer of "
474  "positions passed to function");
475  }
476 
477  std::vector<T> trimmedData(numseqs);
478  std::vector<std::string> trimmedTemp(numseqs);
479 
480  size_t odd_even;
481  if (sites[0] == 0)
482  {
483  for (i = 1; i < numIntervals; ++i)
484  {
485  odd_even = i + 1;
486  if (odd_even % 2 == 0)
487  {
488  newSites.push_back(sites[i] + 1);
489  }
490  else if (odd_even % 2 != 0)
491  {
492  newSites.push_back(sites[i] - 1);
493  }
494  }
495  }
496  else if (sites[0] > 0)
497  {
498  newSites.push_back(0);
499  for (i = 0; i < numIntervals; ++i)
500  {
501  odd_even = i + 1;
502  if (odd_even % 2 == 0)
503  {
504  newSites.push_back(sites[i] + 1);
505  }
506  else if (odd_even % 2 != 0)
507  {
508  newSites.push_back(sites[i] - 1);
509  }
510  }
511  }
512 
513  lastval = size_t(newSites[newSites.size() - 1]);
514  newSites.pop_back();
515  numIntervals = newSites.size();
516  for (i = 0; i < numIntervals; i += 2)
517  {
518  auto start = size_t(newSites[i]);
519  auto stop = size_t(newSites[i + 1]);
520  for (j = 0; j < numseqs; ++j)
521  {
522  trimmedTemp[j]
523  += data[j].seq.substr(start, stop - start + 1);
524  }
525  }
526 
527  for (j = 0; j < numseqs; ++j)
528  {
529  trimmedTemp[j] += data[j].seq.substr(lastval);
530  }
531 
532  for (i = 0; i < numseqs; ++i)
533  {
534  trimmedData[i].name = data[i].name;
535  trimmedData[i].seq = std::move(trimmedTemp[i]);
536  }
537 
538  return trimmedData;
539  }
540  }
541 }