libsequence  1.9.5
Comeron95.cc
1 /*
2 
3  Copyright (C) 2003-2009 Kevin Thornton, krthornt[]@[]uci.edu
4 
5  Remove the brackets to email me.
6 
7  This file is part of libsequence.
8 
9  libsequence is free software: you can redistribute it and/or modify
10  it under the terms of the GNU General Public License as published by
11  the Free Software Foundation, either version 3 of the License, or
12  (at your option) any later version.
13 
14  libsequence is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  GNU General Public License for more details.
18 
19  You should have received a copy of the GNU General Public License
20  long with libsequence. If not, see <http://www.gnu.org/licenses/>.
21 
22 */
23 
24 #include <cmath>
25 #include <cfloat>
26 #include <cctype>
27 #include <memory>
28 #include <cassert>
29 #include <limits>
30 #include <algorithm>
31 #include <Sequence/Seq.hpp>
33 #include <Sequence/Comparisons.hpp>
34 #include <Sequence/Grantham.hpp>
36 #include <Sequence/SingleSub.hpp>
37 #include <Sequence/TwoSubs.hpp>
38 #include <Sequence/ThreeSubs.hpp>
39 #include <Sequence/Sites.hpp>
40 #include <Sequence/Kimura80.hpp>
42 #include <Sequence/Comeron95.hpp>
43 
49 namespace Sequence
50 {
51  struct Comeron95::Com95impl
52  {
53  double Qs, Bs, Qa, Ba, A2S, A4, As, A2V, A0, Aa,
54  q0, q2S, q2V, q4, p0, p2S, p2V, p4,Ka,Ks;
55  std::unique_ptr<RedundancyCom95> sitesObj;
56  GeneticCodes code;
57  void diverge(const Sequence::Seq & seq1, const Sequence::Seq & seq2,
58  const WeightingScheme2 *_weights2,
59  const WeightingScheme3 *_weights3,
60  const int maxhits);
61  void omega (const Sites * s,
62  const Sequence::Seq & seqobj1, const Sequence::Seq & seqobj2);
63  Com95impl(GeneticCodes __code):
64  Qs(0.), Bs(0.), Qa(0.), Ba(0.), A2S(0.), A4(0.), As(0.), A2V(0.), A0(0.), Aa(0.),
65  q0(0.), q2S(0.), q2V(0.), q4(0.), p0(0.), p2S(0.), p2V(0.), p4(0.),Ka(0.),Ks(0.),
66  sitesObj(std::unique_ptr<RedundancyCom95>(new RedundancyCom95(__code))),
67  code(__code)
68  {
69  }
70 
71  /*
72  assert (max >= 1 && max <=3);
73  //check that data are sane
74 
75  //must be same length
76  if(! (seqa.length() == seqb.length()))
77  {
78  throw SeqException("Sequence::Comeron95 -- sequences of unequal lengths");
79  }
80  //must be multiples of 3 in length
81  if(!(fabs(double(seqa.length()%3)-0.)<=DBL_EPSILON ) ||
82  !(fabs(double(seqb.length()%3)-0.)<=DBL_EPSILON ) )
83  {
84  throw (SeqException("Sequence::Comeron95 -- sequence lengths are not multiples of 3"));
85  }
86 
87  //If no weighting schemes are passed to the constructor,
88  //then we default to using Grantham's distances
89  if(_weights2==nullptr)
90  {
91  __2wasNULL=true;
92  weights2 = new GranthamWeights2(code);
93  }
94 
95  if( _weights3 == nullptr)
96  {
97  __3wasNULL=true;
98  weights3 = new GranthamWeights3(code);
99  }
100 
101  maxhits = max;
102 
103  q0 = 0.0;
104  q2S = 0.0;
105  q2V = 0.0;
106  q4 = 0.0;
107  p0 = 0.0;
108  p2S = 0.0;
109  p2V = 0.0;
110  p4 = 0.0;
111  */
112 
113  double ka (void) const;
114  double ks (void) const;
115  double ratio (void) const;
116  double P0 (void) const;
117  double P2S (void) const;
118  double P2V (void) const;
119  double P4 (void) const;
120  double Q0 (void) const;
121  double Q2S (void) const;
122  double Q2V (void) const;
123  double Q4 (void) const;
124  double as (void) const;
125  double aa (void) const;
126  double bs (void) const;
127  double ba (void) const;
128  double L0 (const Sites *) const;
129  double L2S (const Sites * ) const;
130  double L2V (const Sites *) const;
131  double L4 (const Sites *) const;
132  };
133 
134  Comeron95::Comeron95( GeneticCodes code ) : impl(std::unique_ptr<Com95impl>(new Com95impl(code)))
135  {
136  }
137 
138  Comeron95::~Comeron95(){}
139 
157  Com95_t Comeron95::operator()(const Sequence::Seq & seqa,
158  const Sequence::Seq & seqb,
159  int max)
160  {
161  GranthamWeights2 w2;
162  GranthamWeights3 w3;
163  return this->operator()(seqa,seqb,&w2,&w3,max);
164  }
165 
166  Com95_t Comeron95::operator()(const Sequence::Seq & seqa,
167  const Sequence::Seq & seqb,
168  const WeightingScheme2 * weights2,
169  const WeightingScheme3 * weights3,
170  int maxdiffs)
171  {
172  Sites s(seqa,seqb,(*impl->sitesObj),maxdiffs);
173  impl->diverge(seqa,seqb,weights2,weights3,maxdiffs);
174  impl->omega(&s,seqa,seqb);
175  return Com95_t({{impl->ka(),
176  impl->ks(),
177  impl->ratio(),
178  impl->P0(),
179  impl->P2S(),
180  impl->P2V(),
181  impl->P4(),
182  impl->Q0(),
183  impl->Q2S(),
184  impl->Q2V(),
185  impl->Q4(),
186  impl->as(),
187  impl->aa(),
188  impl->bs(),
189  impl->ba(),
190  impl->L0(&s),
191  impl->L2S(&s),
192  impl->L2V(&s),
193  impl->L4(&s)
194  }});
195  }
196 
197  void Comeron95::Com95impl::omega (const Sites * s,
198  const Sequence::Seq & seqobj1,
199  const Sequence::Seq & seqobj2)
209  {
210  double log1, log2;
211 
212  Qs = (q2V + q4) / (s->L2V() + s->L4());
213 
214  if (!std::isfinite (Qs))
215  Qs = 0.0;
216 
217  Bs = (-0.5) * log (1.0 - (2.0 * Qs));
218 
219  Qa = (q0 + q2S) / (s->L0() + s->L2S());
220 
221  if (!std::isfinite (Qa))
222  Qa = 0.0;
223 
224  Ba = (-0.5) * std::log (1.0 - (2.0 * Qa));
225  if (!std::isfinite (Ba))// && !isinf(Ba))
226  {
227  //if sites aren't saturated in general (i.e. Kimura's distance < 1.0)
228  //it is likely that Ba is nan due to too few changes, and thus Ba should equal 0.0
229  //otherwise it is due to too many changes.
230  //NOTE--this is an ad-hoc treatment of the analysis!!!
231  std::unique_ptr<Kimura80> K80( new Kimura80 (&seqobj1, &seqobj2));
232  if (K80->K () < 1.0)
233  Ba = 0.0;
234  }
235 
236  //calculate numbers of mutation per site type
237  double P2S_site = p2S / s->L2S();
238  double P2V_site = p2V / s->L2V();
239  double P0_site = p0 / s->L0();
240  double Q0_site = q0 / s->L0();
241  double P4_site = p4 / s->L4();
242  double Q4_site = q4 / s->L4();
243 
244  log1 = std::log (1.0 - (2.0 * P2S_site) - Qa);
245  log2 = std::log (1.0 - (2.0 * Qa));
246 
247  if (!std::isfinite (log1)) //set value to 0 if nan
248  log1 = 0.0;
249  if (!std::isfinite (log2))
250  log2 = 0.0;
251 
252  A2S = (-0.5) * log1 + (0.25) * log2;
253 
254  log1 = std::log (1.0 - (2.0 * P4_site) - Q4_site);
255  log2 = std::log (1.0 - (2.0 * Q4_site));
256 
257  if (!std::isfinite (log1))
258  log1 = 0.0;
259  if (!std::isfinite (log2))
260  log2 = 0.0;
261 
262  A4 = (-0.5) * log1 + (0.25) * log2;
263 
264  As = (s->L2S() * A2S + s->L4() * A4) / (s->L2S() +
265  s->L4());
266 
267  log1 = std::log (1.0 - (2.0 * P2V_site) - Qs);
268  log2 = std::log (1.0 - (2.0 * Qs));
269 
270  if (!std::isfinite (log1))
271  log1 = 0.0;
272  if (!std::isfinite (log2))
273  log2 = 0.0;
274 
275  A2V = (-0.5) * log1 + (0.25) * log2;
276 
277  log1 = std::log (1.0 - (2.0 * P0_site) - Q0_site);
278  log2 = std::log (1.0 - (2.0 * Q0_site));
279  if (!std::isfinite (log1))
280  log1 = 0.;
281  if (!std::isfinite (log2))
282  log2 = 0.0;
283 
284  A0 = (-0.5) * log1 + (0.25) * log2;
285 
286  Aa = (s->L2V() * A2V + s->L0() * A0) / (s->L2V() +
287  s->L0());
288 
289  if (As <= 0.0)
290  As = 0.0;
291  if (Bs <= 0.0)
292  Bs = 0.0;
293  if (Aa <= 0.0)
294  Aa = 0.0;
295  if (Ba <= 0.0)
296  Ba = 0.0;
297 
298  Ks = As + Bs;
299  Ka = Aa + Ba;
300 
301  if (!std::isfinite (Ks))
302  Ks = std::numeric_limits<double>::quiet_NaN();
303  if (!std::isfinite (Ka))
304  Ka = std::numeric_limits<double>::quiet_NaN();
305  }
306 
307  void Comeron95::Com95impl::diverge (const Sequence::Seq & seq1,
308  const Sequence::Seq & seq2,
309  const WeightingScheme2 *weights2,
310  const WeightingScheme3 *weights3,
311  const int maxdiffs)
317  {
318  size_t i, j;
319  size_t length = seq1.length();
320  q0= q2S= q2V= q4= p0= p2S= p2V= p4 = 0.;
321  //the for loop iterates over codons (block of 3 sites)
322  std::string codon1, codon2;
323  codon1.resize (3);
324  codon2.resize (3);
325 
326  for (i = 0; i <= (length - 3); i += 3)
327  {
328  for (j = 0; j <= 2; ++j)
329  {
330  //assign the next codon from each sequence
331  codon1[j] = char(std::toupper(seq1[i + j]));
332  codon2[j] = char(std::toupper(seq2[i + j]));
333  }
334  if ( std::find_if(codon1.begin(),codon1.end(),ambiguousNucleotide()) == codon1.end() &&
335  std::find_if(codon2.begin(),codon2.end(),ambiguousNucleotide()) == codon2.end() )
336  {
337  //find out if codons are different
338  if (Different (codon1, codon2))
339  {
340  //find out how many changes there are between the codons
341  int ndiff = NumDiffs (codon1, codon2);
342  if (ndiff == 1) //if there is just one difference, use the rules for
343  //codons that only differ at 1 site
344  {
345  SingleSub Single;
346  Single(*sitesObj.get(), codon1, codon2);
347  p0 += Single.P0 ();
348  p2S += Single.P2S ();
349  p2V += Single.P2V ();
350  p4 += Single.P4 ();
351  q0 += Single.Q0 ();
352  q2S += Single.Q2S ();
353  q2V += Single.Q2V ();
354  q4 += Single.Q4 ();
355  }
356 
357  if (maxdiffs > 1)
358  { //if codons with >1 difference are allowed
359  //iterate over codons, as above
360 
361  if (Different
362  (codon1, codon2))
363  {
364  ndiff = NumDiffs
365  (codon1,
366  codon2);
367  //cout << "CHECK:NDIFF="<<ndiff<<endl;
368  if (ndiff == 2
369  && maxdiffs >= 2)
370  { //if codons differ at 2 sites,
371  //use rules of class TwoSubstitutions
372  TwoSubs Double;
373  Double(*sitesObj.get(), codon1, codon2,weights2);
374  p0 += Double. P0();
375  p2S += Double.P2S ();
376  p2V += Double.P2V ();
377  p4 += Double.P4();
378  q0 += Double.Q0 ();
379  q2S += Double.Q2S ();
380  q2V += Double.Q2V ();
381  q4 += Double.Q4 ();
382  }
383  else if (ndiff == 3 && maxdiffs > 2)
384  {
385  ThreeSubs Triple;
386  Triple(*sitesObj.get(),codon1, codon2,weights3);
387  p0 += Triple. P0 ();
388  p2S += Triple.P2S ();
389  p2V += Triple.P2V ();
390  p4 += Triple. P4 ();
391  q0 += Triple.Q0 ();
392  q2S += Triple.Q2S ();
393  q2V += Triple.Q2V ();
394  q4 += Triple.Q4 ();
395  }
396  }
397  }
398  }
399  }
400  }
401 
402  if (!std::isfinite (p0))
403  p0 = 0.0;
404  if (!std::isfinite (p2S) )
405  p2S = 0.0;
406  if (!std::isfinite (p2V) )
407  p2V = 0.0;
408  if (!std::isfinite (p4) )
409  p4 = 0.0;
410  if (!std::isfinite (q0) )
411  q0 = 0.0;
412  if (!std::isfinite (q2S) )
413  q2S = 0.0;
414  if (!std::isfinite (q2V) )
415  q2V = 0.0;
416  if (!std::isfinite (q4) )
417  q4 = 0.0;
418  }
419 
420  double Comeron95::Com95impl::L0 (const Sites * sites) const
424  {
425  return sites->L0();
426  }
427  double Comeron95::Com95impl::L2S (const Sites * sites) const
431  {
432  return sites->L2S();
433  }
434  double Comeron95::Com95impl::L2V (const Sites * sites) const
438  {
439  return sites->L2V();
440  }
441  double Comeron95::Com95impl::L4 (const Sites * sites) const
445  {
446  return sites->L4();
447  }
448 
449  double Comeron95::Com95impl::as (void) const
453  {
454  if (!std::isfinite ( As))
455  return std::numeric_limits<double>::quiet_NaN();
456  return As;
457  }
458  double Comeron95::Com95impl::aa (void) const
462  {
463  if (!std::isfinite ( Aa))
464  return std::numeric_limits<double>::quiet_NaN();
465  return Aa;
466  }
467  double Comeron95::Com95impl::bs (void) const
471  {
472  if (!std::isfinite ( Bs))
473  return std::numeric_limits<double>::quiet_NaN();
474  return Bs;
475  }
476  double Comeron95::Com95impl::ba (void) const
480  {
481  if (!std::isfinite ( Ba))
482  return std::numeric_limits<double>::quiet_NaN();
483  return Ba;
484  }
485 
486  double Comeron95::Com95impl::ratio(void) const
491  {
492  if ( !std::isfinite(Ka) || !std::isfinite(Ks) || fabs(Ks-0.) <= DBL_EPSILON)
493  return std::numeric_limits<double>::quiet_NaN();
494  return Ka / Ks;
495  }
496 
497  double Comeron95::Com95impl::ka (void) const
502  {
503  return Ka;
504  }
505  double Comeron95::Com95impl::ks (void) const
510  {
511  return Ks;
512  }
513 
514  double Comeron95::Com95impl::P0 (void) const
518  {
519  return p0;
520  }
521  double Comeron95::Com95impl::P2S (void) const
525  {
526  return p2S;
527  }
528  double Comeron95::Com95impl::P2V (void) const
532  {
533  return p2V;
534  }
535  double Comeron95::Com95impl::P4 (void) const
539  {
540  return p4;
541  }
542  double Comeron95::Com95impl::Q0 (void) const
546  {
547  return q0;
548  }
549  double Comeron95::Com95impl::Q2S (void) const
553  {
554  return q2S;
555  }
556  double Comeron95::Com95impl::Q2V (void) const
560  {
561  return q2V;
562  }
563  double Comeron95::Com95impl::Q4 (void) const
567  {
568  return q4;
569  }
570 }
double P2S(void) const
Definition: SingleSub.cc:78
double Q0(void) const
Definition: SingleSub.cc:104
declaration of Sequence::Kimura80
double Q2S(void) const
Definition: TwoSubs.cc:204
double Q2S(void) const
Definition: SingleSub.cc:113
abstract interface to weighting schemes when codons differ at 2 positions
class Sequence::Seq, an abstract base class for Sequences
Abstract interface to sequence objects.
Definition: Seq.hpp:46
declaration of Sequence::Comeron95 to calculate Ka/Ks
Kimura&#39;s 2-parameter distance.
Definition: Kimura80.hpp:51
Com95_t operator()(const Sequence::Seq &seqa, const Sequence::Seq &seqb, int maxdiffs=3)
Definition: Comeron95.cc:157
double L2V(void) const
Definition: Sites.cc:147
double Q2V(void) const
Definition: SingleSub.cc:122
double P2S(void) const
Definition: ThreeSubs.cc:237
Tests if a character is in the set A,G,C,T.
used by Sequence::Comeron95, class Sequence::TwoSubs calculates divergence between codons that differ...
class Sequence::Sites calculates the length of a pairwise alignment between coding sequences in terms...
STL namespace.
The namespace in which this library resides.
int NumDiffs(const std::string &seq1, const std::string &seq2, const bool &skip_missing=true, const bool &nucleic_acid=true)
Definition: Comparisons.cc:141
double Q0(void) const
Definition: TwoSubs.cc:195
Deal with codons differing at 1 position.
Definition: SingleSub.hpp:45
declaration of classes to weight codons by Grantham distance (i.e. for Sequence::Comeron95). Declares Sequence::GranthamWeights2 and Sequence::GranthamWeights3
class Sequence::RedundancyCom95
used by Sequence::Comeron95, class Sequence::ThreeSubs calculates divergence between codons that diff...
Grantham&#39;s distances (Sequence::Grantham)
double P2S(void) const
Definition: TwoSubs.cc:168
Weights paths by Grantham&#39;s distances for codons differing at 3 sites.
used by Sequence::Comeron95, class Sequence::SingleSub calculates divergence between codons that diff...
double L4(void) const
Definition: Sites.cc:154
abstract interface to weighting schemes when codons differ at 3 positions
Calculate length statistics for divergence calculations.
Definition: Sites.hpp:58
double L0(void) const
Definition: Sites.cc:133
size_type length(void) const
Definition: Seq.cc:58
double Q0(void) const
Definition: ThreeSubs.cc:252
double P0(void) const
Definition: SingleSub.cc:69
double Q2S(void) const
Definition: ThreeSubs.cc:257
double L2S(void) const
Definition: Sites.cc:140
double P4(void) const
Definition: SingleSub.cc:96
double P2V(void) const
Definition: TwoSubs.cc:177
bool Different(const std::string &seq1, const std::string &seq2, const bool &skip_missing=true, const bool &nucleic_acid=true)
Definition: Comparisons.cc:98
double Q4(void) const
Definition: SingleSub.cc:131
Weights paths by Grantham&#39;s distances for codons differing at 2 sites.
double P2V(void) const
Definition: SingleSub.cc:87
double Q2V(void) const
Definition: ThreeSubs.cc:262
Deal with codons differing at all 3 positions.
Definition: ThreeSubs.hpp:52
double P4(void) const
Definition: TwoSubs.cc:186
delcaration of routines for comparing DNA sequences This file declares a set of functions useful for ...
double Q2V(void) const
Definition: TwoSubs.cc:213
double P2V(void) const
Definition: ThreeSubs.cc:242
double Q4(void) const
Definition: ThreeSubs.cc:267
Deal with codons differing at 2 positions.
Definition: TwoSubs.hpp:61
double Q4(void) const
Definition: TwoSubs.cc:222