libsequence  1.9.5
Recombination.cc
1 /*
2  Copyright (C) 2003-2009 Kevin Thornton, krthornt[]@[]uci.edu
3 
4  Remove the brackets to email me.
5 
6  This file is part of libsequence.
7 
8  libsequence is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  libsequence is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  long with libsequence. If not, see <http://www.gnu.org/licenses/>.
20 */
21 
22 #include <cmath>
23 #include <cfloat>
24 #include <algorithm>
25 #include <limits>
26 #include <cctype>
27 #include <Sequence/PolyTable.hpp>
28 #include <Sequence/SimData.hpp>
32 
33 using std::pow;
34 using std::vector;
35 
36 namespace Sequence
37 {
38  PairwiseLDstats::PairwiseLDstats()
39  : i(std::numeric_limits<double>::quiet_NaN()), j(i), rsq(i), D(i),
40  Dprime(i), skipped(true)
41  {
42  }
43  namespace Recombination
44  {
48  const double CMAX = 10000;
49 
50  namespace
51  {
52  // These functions are implementation details of Hudson's C, and
53  // are in an unnamed namespace to restrict their scope to this
54  // file,
55  // which is the C++ equivalent of a function declared with storage
56  // class static
57  double CalcSamplingVariance(const Sequence::PolyTable *data,
58  const bool &haveOutgroup,
59  const unsigned &outgroup,
60  const int &totsam, const int &ss);
61  void GetPairDiffs(const Sequence::PolyTable *data,
62  const bool &haveOutgroup,
63  const unsigned &outgroup, const int &totsam,
64  vector<int> &list, const int &ss);
65  void get_h_vals(const int &ss, const int &totsam,
66  const Sequence::PolyTable *data,
67  const bool &haveOutgroup, const unsigned &outgroup,
68  double *sumhi, double *sumhisq);
69  double chatsub(const int &totsam, const double &sksq,
70  const double &sumhi, const double &sumhisq);
71  double solveit(const int &nsam, const double &stat);
72  double g(const int &nsam, const double &c);
73 
74  double
75  CalcSamplingVariance(const Sequence::PolyTable *data,
76  const bool &haveOutgroup,
77  const unsigned &outgroup, const int &totsam,
78  const int &ss)
84  {
85  unsigned i;
86  double Svar = 0.0, meandiffs = 0.0;
87  double nsam = double(totsam);
88  vector<int> pairdiffs;
89  GetPairDiffs(data, haveOutgroup, outgroup, totsam, pairdiffs,
90  ss);
91  // increment __meandiffs to avoid loss of precision when
92  // incrementing doubles
93  for (i = 0; unsigned(i) < pairdiffs.size(); ++i)
94  {
95  meandiffs += double(pairdiffs[i]);
96  }
97  meandiffs /= pow(nsam, 2.0);
98  for (i = 0; unsigned(i) < pairdiffs.size(); ++i)
99  Svar += pow((double(pairdiffs[i]) - meandiffs), 2);
100  Svar /= pow(nsam, 2.);
101  return (Svar);
102  }
103 
104  void
105  GetPairDiffs(const Sequence::PolyTable *data,
106  const bool &haveOutgroup, const unsigned &outgroup,
107  const int &totsam, vector<int> &list, const int &ss)
114  {
115  unsigned i, j, k;
116  for (i = 0; int(i) < totsam; ++i)
117  {
118  for (j = 0; int(j) < totsam; ++j)
119  {
120  if ((!(haveOutgroup))
121  || (haveOutgroup && i != unsigned(outgroup)
122  && j != unsigned(outgroup)))
123  {
124  if (i != j)
125  {
126  int ndiffs = 0;
127  for (k = 0; k < unsigned(ss);
128  ++k)
129  {
130  // comparison to 'N'
131  // has the effect of
132  // skipping missing
133  // data
134  char ch1 = char(
135  std::toupper(
136  (*data)
137  [i]
138  [k])),
139  ch2 = char(
140  std::toupper(
141  (*data)
142  [j]
143  [k]));
144  if (ch1 != 'N'
145  && ch2 != 'N')
146  if (ch1 != ch2)
147  ++ndiffs;
148  }
149  list.push_back(ndiffs);
150  }
151  else
152  list.push_back(0);
153  }
154  }
155  }
156  }
157 
158  void
159  get_h_vals(const int &ss, const int &totsam,
160  const Sequence::PolyTable *data,
161  const bool &haveOutgroup, const unsigned &outgroup,
162  double *sumhi, double *sumhisq)
169  {
170  *sumhi = 0.;
171  *sumhisq = 0.;
172 
173  if (ss == 0)
174  {
175  // don't need to change anything
176  // *sumhi = 0.;
177  // *sumhisq = 0.;
178  }
179  else
180  {
181  for (int i = 0; i < ss; ++i)
182  {
183  stateCounter Counts;
184  // samplesize is counted per site, and is
185  // adjusted if missing data is encountere
186  int samplesize = totsam;
187  for (unsigned j = 0; j < data->size(); ++j)
188  {
189  if ((!(haveOutgroup))
190  || (haveOutgroup
191  && j != unsigned(outgroup)))
192  {
193  samplesize
194  -= (std::toupper((
195  *data)[j][unsigned(
196  i)])
197  == 'N')
198  ? 1
199  : 0;
200  Counts(
201  (*data)[j][unsigned(i)]);
202  }
203  }
204 
205  // a copy-paste from ThetaPi() from PolySNP.cc
206  double SSH = 0.;
207  double denom = (double(samplesize)
208  * (double(samplesize) - 1.0));
209  SSH += (Counts.a > 0)
210  ? double(Counts.a)
211  * double(Counts.a - 1) / denom
212  : 0.;
213  SSH += (Counts.g > 0)
214  ? double(Counts.g)
215  * double(Counts.g - 1) / denom
216  : 0.;
217  SSH += (Counts.c > 0)
218  ? double(Counts.c)
219  * double(Counts.c - 1) / denom
220  : 0.;
221  SSH += (Counts.t > 0)
222  ? double(Counts.t)
223  * double(Counts.t - 1) / denom
224  : 0.;
225  SSH += (Counts.zero > 0)
226  ? double(Counts.zero)
227  * double(Counts.zero - 1)
228  / denom
229  : 0.;
230  SSH += (Counts.one > 0)
231  ? double(Counts.one)
232  * double(Counts.one - 1)
233  / denom
234  : 0.;
235  *sumhi += (1.0 - SSH);
236  *sumhisq += pow(1.0 - SSH, 2.0);
237  }
238  }
239  }
240 
241  double
242  chatsub(const int &totsam, const double &sksq, const double &sumhi,
243  const double &sumhisq)
248  {
249  double stat, thetahat, estimate;
250  thetahat = sumhi;
251  stat = (sksq - sumhi + sumhisq) / (thetahat * thetahat);
252  estimate = solveit(totsam, stat);
253  return (estimate);
254  }
255 
256  double
257  solveit(const int &totsam, const double &stat)
263  {
264  double xbig, xsmall;
265 
266  xbig = 10.;
267  xsmall = std::numeric_limits<float>::epsilon();
268 
269  if (fabs(g(totsam, xsmall)) - stat
270  <= std::numeric_limits<double>::epsilon())
271  return (xsmall);
272 
273  while (fabs(g(totsam, xbig)) - stat
274  >= std::numeric_limits<double>::epsilon()
275  && xbig < CMAX)
276  xbig += 10.;
277  if ((xbig >= CMAX)
278  && fabs(g(totsam, xbig) - stat)
279  > std::numeric_limits<double>::epsilon())
280  return (CMAX);
281  if (xbig > 10.)
282  xsmall = xbig - 10.;
283 
284  while ((xbig - xsmall) > std::numeric_limits<float>::epsilon())
285  {
286  double xtemp = (xbig + xsmall) / 2.;
287  if (fabs(g(totsam, xtemp)) - stat
288  > std::numeric_limits<double>::epsilon())
289  xsmall = xtemp;
290  else
291  xbig = xtemp;
292  }
293  return ((xbig + xsmall) / 2.);
294  }
295 
296  double
297  g(const int &totsam, const double &c)
303  {
304  double x, y, esk;
305  double r97, i1, i2, sumpd, n, b1, b2, b3;
306 
307  n = double(totsam);
308 
309  r97 = sqrt(97.);
310  x = 13. + r97;
311  y = 13. - r97;
312  i2 = (log(((2. * c + y) * x) / ((2. * c + x) * y))) / r97;
313  i1 = (0.5) * log((c * c + 13. * c + 18.) / 18.)
314  - 13. * i2 / 2.;
315  sumpd
316  = (-c + (c - 1.) * i1 + 2. * (7. * c + 9.) * i2) / (c * c);
317  b1 = 1. / 2. + 1. / c
318  + ((5. - c) * i1 - 18. * (c + 1.) * i2) / (c * c);
319  b2 = -1. / 2. + 2. / c
320  - 2. * ((c + 9.) * i1 + 2. * (2. * c + 9.) * i2)
321  / (c * c);
322  b3 = -2. / c
323  + 2. * ((c + 7.) * i1 + 6. * (c + 3.) * i2) / (c * c);
324  esk = sumpd + b1 / n + b2 / (n * n) + b3 / (n * n * n);
325  return (2. * esk);
326  }
327  } // namespace
328  double
329  HudsonsC(const Sequence::PolyTable *data, const bool &haveOutgroup,
330  const unsigned &outgroup)
339  {
340  int totsam = int(data->size());
341  if (haveOutgroup)
342  --totsam;
343  int ss = int((*data)[0].length());
344  double sumhi = 0.0, sumhisq = 0.0;
345  get_h_vals(ss, totsam, data, haveOutgroup, outgroup, &sumhi,
346  &sumhisq);
347  double sksq = CalcSamplingVariance(data, haveOutgroup, outgroup,
348  totsam, ss);
349  return (chatsub(totsam, sksq, sumhi, sumhisq));
350  }
351 
352  std::vector<PairwiseLDstats>
354  const bool &haveOutgroup, const unsigned &outgroup,
355  const unsigned &mincount, const double max_distance)
356  {
357  using return_type = std::vector<PairwiseLDstats>;
358  if (data->empty() || data->numsites() < 2)
359  return return_type();
360 
361  auto nsites = data->numsites();
362  return_type rv;
363  rv.reserve(nsites);
364  for (std::size_t i = 0; i < nsites - 1; ++i)
365  {
366  for (std::size_t j = i + 1; j < nsites; ++i)
367  {
368  rv.push_back(PairwiseLD(data, i, j, haveOutgroup,
369  outgroup, mincount,
370  max_distance));
371  }
372  }
373  return rv;
374  }
375 
377  PairwiseLD(const Sequence::PolyTable *data, unsigned i, unsigned j,
378  const bool &haveOutgroup, const unsigned &outgroup,
379  const unsigned &mincount, const double max_distance)
380  {
381  typedef Sequence::PolyTable::size_type USIZE;
382  unsigned ss = data->numsites();
383  if (!(i < ss - 1))
384  {
385  throw std::runtime_error("site index i is out of range, "
386  + std::string(__FILE__) + ", line"
387  + std::to_string(__LINE__));
388  }
389  if (j <= i)
390  {
391  throw std::runtime_error("j must be > i, "
392  + std::string(__FILE__) + ", line"
393  + std::to_string(__LINE__));
394  }
395  PairwiseLDstats rv;
396  rv.skipped = 1;
397  const char _alphabet[10]
398  = { 'A', 'G', 'C', 'T', '0', '1', 'a', 'g', 'c', 't' };
399  static const unsigned alphsize = 6;
400  PolyTable::const_site_iterator beg = data->sbegin();
401  const double pos1 = (beg + i)->first, pos2 = (beg + j)->first;
402 #ifndef NDEBUG
403  const double op1 = pos1, op2 = pos2;
404 #endif
405  const USIZE datasize = data->size();
406  const USIZE nsam = datasize - USIZE(haveOutgroup);
407 
408  // chars1.first is the character state of the low-frequency/derived
409  // allele at site i
410  // chars1.second is the character state of the
411  // high-frequency/ancestral allele at site i
412  // chars2 is for sitej
413  std::pair<char, char> chars1, chars2;
414  // counts2.first is the # occurences of the low-frequency/derived
415  // allele at site j
416  // counts2.second is the # occurences of the
417  // high-frequency/anecstral allele at site j
418  // counts2 is for site j
419  std::pair<USIZE, USIZE> counts1, counts2;
420 
421  chars1.first
422  = 'Z'; // assign to a dummy character not in the alphabet
423  chars1.second = 'Z';
424  counts1.first = 0;
425  counts1.second = 0;
426  unsigned states1 = 0;
427  std::string site1((beg + i)->second);
428  for (unsigned letter = 0; letter < alphsize; ++letter)
429  {
430  std::transform(site1.begin(), site1.end(), site1.begin(),
431  ::toupper);
432  if (std::find(site1.begin(), site1.end(),
433  _alphabet[letter])
434  != site1.end())
435  {
436  if (chars1.first == 'Z')
437  {
438  chars1.first = char(
439  std::toupper(_alphabet[letter]));
440  ++states1;
441  }
442  else
443  {
444  chars1.second = char(
445  std::toupper(_alphabet[letter]));
446  if (chars1.first != chars1.second)
447  {
448  ++states1;
449  }
450  }
451  }
452  }
453  // skip if site i has > 2 sites4
454  if (states1 == 2)
455  {
456  if (haveOutgroup)
457  {
458  site1.replace(outgroup, 1, "");
459  }
460  if (std::abs(data->position(j) - data->position(i))
461  <= max_distance)
462  {
463  std::string site2((beg + j)->second);
464  std::transform(site2.begin(), site2.end(),
465  site2.begin(), ::toupper);
466  chars2.first = 'Z'; // assign to a dummy character
467  // not in the alphabet
468  chars2.second = 'Z';
469  counts2.first = 0;
470  counts2.second = 0;
471  unsigned states2 = 0;
472  for (unsigned letter = 0; letter < alphsize;
473  ++letter)
474  {
475  if (std::find(site2.begin(), site2.end(),
476  _alphabet[letter])
477  != site2.end())
478  {
479  if (chars2.first == 'Z')
480  {
481  chars2.first
482  = char(std::toupper(
483  _alphabet
484  [letter]));
485  ++states2;
486  }
487  else
488  {
489  chars2.second
490  = char(std::toupper(
491  _alphabet
492  [letter]));
493  if (chars2.first
494  != chars2.second)
495  {
496  ++states2;
497  }
498  }
499  }
500  }
501 
502  // skip if site j has > 2 states
503  if (states2 == 2)
504  {
505  if (haveOutgroup)
506  {
507  site2.replace(outgroup, 1, "");
508  }
509 
510  // skip pairs of sites with missing data
511  if (std::find(site1.begin(), site1.end(),
512  'N')
513  == site1.end()
514  && std::find(site2.begin(),
515  site2.end(), 'N')
516  == site2.end())
517  {
518  // Make sure that the .first and
519  // .second
520  // mean what we want them to for
521  // chars1 and chars2.
522  // If there is no outgroup,then
523  // .second will
524  // be the high frequency allele.
525  // If there
526  // is an outgroup, .second will be
527  // the derived
528  // allele. With this scheme,
529  // chars1.second
530  // and chars2.second together
531  // represent the 11
532  // gamete
533  if (haveOutgroup == false)
534  {
535  // count the # occurences
536  // of chars1.first
537  // at site i
538  size_t x
539  = unsigned(std::count(
540  site1.begin(),
541  site1.end(),
542  chars1.first));
543  // count the # occurences
544  // of chars2.first
545  // at site j
546  size_t y
547  = unsigned(std::count(
548  site2.begin(),
549  site2.end(),
550  chars2.first));
551 
552  if (x > nsam - x)
553  {
554  // need to swap
555  // chars1.first and
556  // chars1.second
557  // such that
558  // chars1.second is
559  // the
560  // high-frequency
561  // allele
562  std::swap(
563  chars1.first,
564  chars1.second);
565  counts1.first
566  = nsam - x;
567  counts1.second = x;
568  }
569  else
570  {
571  counts1.first = x;
572  counts1.second
573  = nsam - x;
574  }
575 
576  if (y > nsam - y)
577  {
578  // need to swap
579  // chars2.first and
580  // chars2.second
581  // such that
582  // chars2.second is
583  // the
584  // high-frequency
585  // allele
586  std::swap(
587  chars2.first,
588  chars2.second);
589  counts2.first
590  = nsam - y;
591  counts2.second = y;
592  }
593  else
594  {
595  counts2.first = y;
596  counts2.second
597  = nsam - y;
598  }
599  }
600  else if (haveOutgroup == true)
601  {
602  // if chars1.first is
603  // ancestral, swap
604  // chars1.first
605  // and chars1.second so
606  // chars1.first represents
607  // the derived state
608  if (chars1.first
609  == char(std::toupper(
610  (beg + i)->second
611  [outgroup])))
612  {
613  std::swap(
614  chars1.first,
615  chars1.second);
616  }
617  counts1.first
618  = unsigned(std::count(
619  site1.begin(),
620  site1.end(),
621  chars1.first));
622  counts1.second
623  = nsam - counts1.first;
624 
625  // if chars2.first is
626  // ancestral, swap
627  // chars2.first
628  // and chars2.second so
629  // chars2.first represents
630  // the derived state
631  if (chars2.first
632  == char(std::toupper(
633  (beg + j)->second
634  [outgroup])))
635  {
636  std::swap(
637  chars2.first,
638  chars2.second);
639  }
640  counts2.first
641  = unsigned(std::count(
642  site2.begin(),
643  site2.end(),
644  chars2.first));
645  counts2.second
646  = nsam - counts2.first;
647  }
648 
649  // don't continue unless
650  // minor/derived
651  // allele counts are greater than
652  // mincount
653  // at both sites
654  if (counts1.first >= mincount
655  && counts2.first >= mincount)
656  {
657  // now we can actually do
658  // the work...
659  assert(pos1 == op1);
660  assert(pos2 == op2);
661  rv.i = pos1;
662  rv.j = pos2;
663 
664  unsigned counts00 = 0,
665  counts01 = 0,
666  counts10 = 0,
667  counts11 = 0;
668  for (unsigned k = 0;
669  k < nsam; ++k)
670  {
671  if (site1[k]
672  == chars1
673  .first
674  && site2[k]
675  == chars2
676  .first)
677  ++counts11; // the 11 type is based on minor/derived allele
678 
679  if (site1[k]
680  == chars1
681  .first
682  && site2[k]
683  == chars2
684  .second)
685  ++counts10;
686 
687  if (site1[k]
688  == chars1
689  .second
690  && site2[k]
691  == chars2
692  .first)
693  ++counts01;
694 
695  if (site1[k]
696  == chars1
697  .second
698  && site2[k]
699  == chars2
700  .second)
701  ++counts00;
702  }
703 
704  double p0
705  = double(
706  counts1.second)
707  / double(nsam);
708  double p1
709  = double(counts1.first)
710  / double(nsam);
711  double q0
712  = double(
713  counts2.second)
714  / double(nsam);
715  double q1
716  = double(counts2.first)
717  / double(nsam);
718 
719  rv.D = double(counts11)
720  / double(nsam)
721  - p1 * q1;
722  rv.rsq = (rv.D * rv.D)
723  / (p0 * p1 * q0
724  * q1);
725  double dmin = std::max(
726  -p0 * q0, -p1 * q1);
727  double dmax = std::min(
728  p1 * q0, p0 * q1);
729  rv.Dprime
730  = (rv.D < 0
731  ? -(rv.D / dmin)
732  : rv.D / dmax);
733  rv.skipped = 0;
734  }
735  else
736  {
737  rv.skipped = 1;
738  }
739  }
740  else
741  {
742  rv.skipped = 1;
743  }
744  }
745  else
746  {
747  rv.skipped = 1;
748  }
749  }
750  else
751  {
752  rv.skipped = 1;
753  }
754  }
755  return rv;
756  }
757 
758  } // namespace Recombination
759 } // namespace Sequence
The base class for polymorphism tables.
keep track of state counts at a site in an alignment or along a sequence
double i
Position of site i.
STL namespace.
double j
Position of site j.
Sequence::PolyTable, a virtual base class for polymorphism tables.
The namespace in which this library resides.
double rsq
$r^2$ between sites i and j
std::vector< PairwiseLDstats > Disequilibrium(const Sequence::PolyTable *data, const bool &haveOutgroup=false, const unsigned &outgroup=0, const unsigned &mincount=1, const double max_distance=std::numeric_limits< double >::max()) __attribute__((deprecated))
Calculate pairwise LD for a Sequence::PolyTable.
Pairwise linkage disequilibrium (LD) stats .
double HudsonsC(const Sequence::PolyTable *data, const bool &haveOutgroup, const unsigned &outgroup) __attribute__((deprecated))
namespace Sequence::Recombination
Declaration of Sequence::SimData, a class representing polymorphism data from coalescent simulations ...
double D
The D statistics.
declaration of Sequence::stateCounter, a class to keep track of nucleotide counts either at a site in...
PairwiseLDstats PairwiseLD(const Sequence::PolyTable *data, unsigned i, unsigned j, const bool &haveOutgroup=false, const unsigned &outgroup=0, const unsigned &mincount=1, const double max_distance=std::numeric_limits< double >::max()) __attribute__((deprecated))
bool skipped
If the site fails and filters, this is true.