libsequence  1.9.5
testClassicSummstats.cc
Go to the documentation of this file.
1 
3 #include <cmath>
4 #include <algorithm>
5 #include <set>
6 #include <string>
7 #include <vector>
8 #include <iostream>
9 #include <Sequence/VariantMatrix.hpp>
10 #include <Sequence/VariantMatrixViews.hpp>
12 #include "VariantMatrixFixture.hpp"
13 #include <boost/test/unit_test.hpp>
14 
15 static double
16 manual_pi(const Sequence::VariantMatrix& m)
17 {
18  double manual = 0.0;
19  for (std::size_t i = 0; i < m.nsites; ++i)
20  {
21  auto r = Sequence::get_ConstRowView(m, i);
22  int ndiffs = 0, ncomps = 0;
23  for (std::size_t j = 0; j < m.nsam - 1; ++j)
24  {
25  for (std::size_t k = j + 1; k < m.nsam; ++k)
26  {
27  if (r[j] >= 0 && r[k] >= 0)
28  {
29  if (r[j] != r[k])
30  {
31  ++ndiffs;
32  }
33  ++ncomps;
34  }
35  }
36  }
37  manual
38  += static_cast<double>(ndiffs) / static_cast<double>(ncomps);
39  }
40  return manual;
41 }
42 
43 static double
44 manual_thetah(const Sequence::VariantMatrix& m, const std::int8_t refstate)
45 {
46  double nnm1 = static_cast<double>(m.nsam * (m.nsam - 1));
47  double h = 0.0;
48  for (std::size_t i = 0; i < m.nsites; ++i)
49  {
50  auto r = Sequence::get_ConstRowView(m, i);
51  unsigned nnonref = 0;
52  for (auto ri : r)
53  {
54  if (ri != refstate)
55  {
56  ++nnonref;
57  }
58  }
59  h += std::pow(nnonref, 2.0);
60  }
61  h *= 2.0 / nnm1;
62  return h;
63 }
64 
65 BOOST_FIXTURE_TEST_SUITE(test_classic_stats, dataset)
66 
67 BOOST_AUTO_TEST_CASE(test_thetapi)
68 {
69  auto pi = Sequence::thetapi(c);
70  auto manual = manual_pi(m);
71  // Cannot require equal b/c we aren't doing ops
72  // in same order.
73  BOOST_CHECK_CLOSE(pi, manual, 1e-6);
74 }
75 
76 BOOST_AUTO_TEST_CASE(test_thetapi_with_mising_data)
77 // If this test passes, it implies
78 // that Sequence::StateCounter behaves
79 // correctly w.r.to handling missing data.
80 {
81  // Make a bunch of missing data, all w/
82  // different missing data encoding
83  for (int i = 1; i < static_cast<int>(m.nsites); i += 2)
84  {
85  m.data[i] = -i;
86  }
88 
89  auto manual = manual_pi(m);
90  // Cannot require equal b/c we aren't doing ops
91  // in same order.
92  BOOST_CHECK_CLOSE(pi, manual, 1e-6);
93 }
94 
95 BOOST_AUTO_TEST_CASE(test_num_poly_sites)
96 {
97  auto S = Sequence::nvariable_sites(c);
98  // Not universally true, but is true here:
99  BOOST_REQUIRE_EQUAL(m.nsites, S);
100 
102  // Not universally true, but is true here:
103  BOOST_REQUIRE_EQUAL(m.nsites, tm);
104 
105  //Make the last site invariant:
106  auto r = Sequence::get_RowView(m, m.nsites - 1);
107  std::fill(r.begin(), r.end(), 0);
110  BOOST_REQUIRE_EQUAL(S, m.nsites - 1);
112  BOOST_REQUIRE_EQUAL(m.nsites - 1, tm);
113 }
114 
115 BOOST_AUTO_TEST_CASE(test_total_num_mutations)
116 {
117  auto r = Sequence::get_RowView(m, m.nsites - 1);
118  // Add a third character state
119  r[2] = 2;
120  Sequence::VariantMatrix m2(std::move(m.data), std::move(m.positions));
123  BOOST_REQUIRE_EQUAL(tm, m.nsites + 1);
124 }
125 
126 BOOST_AUTO_TEST_CASE(test_nbiallelic_sites)
127 {
128  auto S2 = Sequence::nbiallelic_sites(c);
129  BOOST_REQUIRE_EQUAL(S2, m.nsites);
130  auto r = Sequence::get_RowView(m, m.nsites - 1);
131  // Add a third character state
132  r[2] = 2;
133  Sequence::VariantMatrix m2(std::move(m.data), std::move(m.positions));
136  BOOST_REQUIRE_EQUAL(S2, m.nsites - 1);
137 }
138 
139 BOOST_AUTO_TEST_CASE(test_count_alleles)
140 {
141  auto S2 = Sequence::nbiallelic_sites(c);
142  auto ac = Sequence::allele_counts(c);
143  auto nb = 0;
144  for (auto i : ac)
145  {
146  if (i.nstates == 2)
147  {
148  ++nb;
149  }
150  }
151  BOOST_REQUIRE_EQUAL(S2, nb);
152 }
153 
154 BOOST_AUTO_TEST_CASE(test_thetaw)
155 // The above two tests imply that
156 // thetaw is correct. Thus,
157 // all we're doing below is
158 // checking the denominator.
159 {
160  auto w = Sequence::thetaw(c);
161  double S = m.nsites;
162  double d = 0.0;
163  for (int i = 1; i < m.nsam; ++i)
164  {
165  d += 1. / static_cast<double>(i);
166  }
167  auto manual = S / d;
168  BOOST_CHECK_CLOSE(w, manual, 1e-6);
169 }
170 
171 BOOST_AUTO_TEST_CASE(test_thetah)
172 // Simplest case
173 {
174  auto h0 = Sequence::thetah(c, 0);
175  auto h1 = Sequence::thetah(c, 1);
176  auto m0 = manual_thetah(m, 0);
177  auto m1 = manual_thetah(m, 1);
178  BOOST_CHECK_CLOSE(h0, m0, 1e-6);
179  BOOST_CHECK_CLOSE(h1, m1, 1e-6);
180 }
181 
182 BOOST_AUTO_TEST_CASE(test_faywuh)
183 // Fay and Wu's H is calculated using
184 // an aggregation distinct from
185 // thetah and thetapi. That means
186 // we can compare results from the various functions
187 // in order to test.
188 {
189  auto h = Sequence::thetah(c, 0);
190  auto pi = Sequence::thetapi(c);
191  auto fwh = Sequence::faywuh(c, 0);
192  BOOST_CHECK_EQUAL(fwh + h, pi);
193  BOOST_REQUIRE_EQUAL(pi - h, fwh);
194 }
195 
196 BOOST_AUTO_TEST_CASE(test_thetah_multiple_derived_states)
197 {
198  // Create a site with > 2 derived states
199  auto f = Sequence::get_RowView(m, 0);
200  for (std::size_t i = 0; i < f.size(); ++i)
201  {
202  f[i] = (i % 2 == 0.0) ? 2 : 3;
203  if (i % 3 == 0.0)
204  {
205  f[i] = 4;
206  }
207  }
208  //We have to make data copies here so that
209  //max_allele is reset.
211  BOOST_REQUIRE_THROW(auto h
213  std::runtime_error);
214  for (std::size_t i = 0; i < f.size(); ++i)
215  {
216  f[i] = 3;
217  }
219  BOOST_REQUIRE_NO_THROW(
221 }
222 
223 BOOST_AUTO_TEST_CASE(test_num_haplotypes)
224 {
225  auto nh = Sequence::number_of_haplotypes(m);
226  BOOST_REQUIRE_EQUAL(nh, 5);
227 }
228 
229 BOOST_AUTO_TEST_CASE(test_unique_hap_at_any_index)
230 {
231  for (std::size_t i = 0; i < m.nsam; ++i)
232  {
234  // We make a unique haplotype at this index in our copy of
235  // the fixture
236  auto cv = Sequence::get_ColView(m2, i);
237  for (auto& j : cv)
238  {
239  j = 2;
240  }
241  auto nh = Sequence::number_of_haplotypes(m2);
242  std::vector<std::string> haps;
243  for (std::size_t j = 0; j < m2.nsam; ++j)
244  {
245  auto cvj = Sequence::get_ConstColView(m2, j);
246  std::string h;
247  for (auto state : cvj)
248  h += state;
249  haps.push_back(std::move(h));
250  }
251  std::set<std::string> uhaps(haps.begin(), haps.end());
252  BOOST_REQUIRE_EQUAL(uhaps.size(), nh);
253  }
254 }
255 
256 BOOST_AUTO_TEST_CASE(test_haplotype_diversity)
257 {
258  auto hd = Sequence::haplotype_diversity(m);
259 
260  // We calculate haplotype diversity here
261  // in a brute-force manner:
262  // 1. Build a vector of all haplotypes.
263  // 2. Construct a vector of all unique haplotypes
264  // 3. Explicitly cound the number of occurrences
265  // of each unique haplotype.
266  std::vector<std::vector<std::int8_t>> haps;
267  for (std::size_t i = 0; i < m.nsam; ++i)
268  {
269  auto col = Sequence::get_ConstColView(m, i);
270  haps.emplace_back(
271  std::vector<std::int8_t>(col.begin(), col.end()));
272  }
273  decltype(haps) uhaps;
274  for (auto& h : haps)
275  {
276  if (std::find(uhaps.begin(), uhaps.end(), h) == uhaps.end())
277  {
278  uhaps.push_back(h);
279  }
280  }
281  double mhd = 0.0;
282  for (auto& uh : uhaps)
283  {
284  auto c = std::count(haps.begin(), haps.end(), uh);
285  mhd += static_cast<double>(c * (m.nsam - c));
286  }
287  mhd /= static_cast<double>(m.nsam * (m.nsam - 1));
288 
289  BOOST_CHECK_CLOSE(hd, mhd, 1e-6);
290 }
291 
292 BOOST_AUTO_TEST_CASE(test_rmin)
293 {
294  auto rm = Sequence::rmin(m);
295  BOOST_REQUIRE_EQUAL(rm, 1);
296 }
297 
298 BOOST_AUTO_TEST_SUITE_END()
std::vector< AlleleCounts > allele_counts(const AlleleCountMatrix &m)
Count number of alleles at each site.
double thetah(const AlleleCountMatrix &ac, const std::int8_t refstate)
Fay and Wu&#39;s .
std::int32_t number_of_haplotypes(const VariantMatrix &m)
Calculate the number of haplotypes in a sample.
size_type size(void) const
Definition: Seq.cc:67
std::uint32_t total_number_of_mutations(const AlleleCountMatrix &m)
Total number of mutations in the sample.
ConstRowView get_RowView(const VariantMatrix &m, const std::size_t row)
Return a ConstRowView from VariantMatrix m at row row. std::out_of_range is thrown if row is out of r...
std::uint32_t nvariable_sites(const AlleleCountMatrix &m)
Number of polymorphic sites.
std::size_t nsites
Number of sites in data set.
"Classic" summaries of variation data.
double thetapi(const AlleleCountMatrix &ac)
Mean pairwise differences.
Definition: thetapi.cc:7
std::vector< double > positions
Position of sites.
double faywuh(const AlleleCountMatrix &ac, const std::int8_t refstate)
Fay and Wu&#39;s H.
Definition: faywuh.cc:8
double thetaw(const AlleleCountMatrix &ac)
Watterson&#39;s theta.
Definition: thetaw.cc:9
ConstRowView get_ConstRowView(const VariantMatrix &m, const std::size_t row)
Return a ConstRowView from VariantMatrix m at row row. std::out_of_range is thrown if row is out of r...
std::int32_t rmin(const VariantMatrix &m)
Definition: rmin.cc:11
Matrix representation of variation data.
std::uint32_t nbiallelic_sites(const AlleleCountMatrix &m)
Number of bi-allelic sites.
ColView get_ColView(VariantMatrix &m, const std::size_t col)
Return a ColView from VariantMatrix m at col col. std::out_of_range is thcoln if col is out of range...
std::size_t nsam
Sample size of data set.
ConstColView get_ConstColView(VariantMatrix &m, const std::size_t col)
Return a ConstColView from VariantMatrix m at col col. std::out_of_range is thcoln if col is out of r...
std::vector< std::int8_t > data
Data stored in matrix form with rows as sites.
double haplotype_diversity(const VariantMatrix &m)
Calculate the haplotype diversity of a sample.
Matrix representation of allele counts in a VariantMatrix To be constructed.