libsequence  1.9.5
testGarudStatistics.cc
Go to the documentation of this file.
1 
3 #include <cmath>
4 #include <algorithm>
5 #include <vector>
6 #include <string>
7 #include <set>
8 #include <iostream>
9 #include <Sequence/VariantMatrix.hpp>
10 #include <Sequence/VariantMatrixViews.hpp>
13 #include "VariantMatrixFixture.hpp"
14 #include <boost/test/unit_test.hpp>
15 
16 BOOST_FIXTURE_TEST_SUITE(test_garud_stats, dataset)
17 
18 BOOST_AUTO_TEST_CASE(test_garud_stats)
19 /* This test compares the results to alternative
20  * (and less efficient) implementations
21  * of the same procedures.
22  */
23 {
24  auto G = Sequence::garud_statistics(m);
25 
26  //Their H1 stat is the same as 1 - haplotype diversity.
27  auto hdiv = Sequence::haplotype_diversity(m);
28  // This test is a cheat, as this is exactly
29  // how H1 is calculated internally
30  BOOST_REQUIRE_EQUAL(1.0 - hdiv, G.H1);
31 
32  //Let's do an alternative method.
33  std::vector<std::string> haps;
34  for (auto i = 0; i < m.nsam; ++i)
35  {
36  auto c = Sequence::get_ConstColView(m, i);
37  std::string h;
38  for (auto ci : c)
39  {
40  h += (ci == 0) ? '0' : '1';
41  }
42  haps.push_back(std::move(h));
43  }
44  // unique haplotypes
45  std::set<std::string> uhaps(haps.begin(), haps.end());
46  std::vector<int> hcounts;
47  for (auto& u : uhaps)
48  {
49  hcounts.push_back(std::count(haps.begin(), haps.end(), u));
50  }
51  std::sort(hcounts.begin(), hcounts.end(), std::greater<int>());
52  double H1 = 0.0;
53  double nsam = static_cast<double>(m.nsam);
54  for (auto hc : hcounts)
55  {
56  H1 += static_cast<double>(hc) * static_cast<double>(hc - 1);
57  }
58  H1 /= ((nsam) * (nsam - 1));
59 
60  //We cannot require equality because we are summing things in
61  //different orders. Instead, we check that relative error is
62  //very low.
63  BOOST_CHECK_CLOSE(H1, G.H1, 1e-6);
64 
65  //H12 is H1, but combining the freq of the two most common haplotypes
66  decltype(hcounts) hcounts2(hcounts.begin() + 1, hcounts.end());
67  hcounts2[0] += hcounts[0];
68  double H12 = 0.0;
69  for (auto hc : hcounts2)
70  {
71  double x = static_cast<double>(hc) * static_cast<double>(hc - 1);
72  x /= (nsam * (nsam - 1));
73  H12 += x;
74  }
75  BOOST_CHECK_CLOSE(H12, G.H12, 1e-6);
76 
77  double p1sq = static_cast<double>(hcounts[0] * (hcounts[0] - 1))
78  / (nsam * (nsam - 1.0));
79  double H2 = H1 - p1sq;
80  BOOST_CHECK_CLOSE(H2 / H1, G.H2H1, 1e-6);
81 }
82 
83 BOOST_AUTO_TEST_SUITE_END()
GarudStats garud_statistics(const VariantMatrix &m)
Calculate H1, H12, and H2/H1.
Definition: garud.cc:26
H1, H12, and H2/H1 stats.
"Classic" summaries of variation data.
ConstColView get_ConstColView(VariantMatrix &m, const std::size_t col)
Return a ConstColView from VariantMatrix m at col col. std::out_of_range is thcoln if col is out of r...
double haplotype_diversity(const VariantMatrix &m)
Calculate the haplotype diversity of a sample.