libsequence  1.9.5
PolySIMtest.cc
1 /*
2  Validate calculation of summary stats via PolySIM
3  by independently recoding them all. Joy.
4  */
5 #include <Sequence/SimData.hpp>
6 #include <Sequence/PolySIM.hpp>
7 #include <boost/test/unit_test.hpp>
8 #include <fstream>
9 #include <iostream>
10 #include <set>
11 #include <algorithm>
12 #include <limits>
13 #include <cmath>
14 
15 const char * testfile = "data/single_ms.txt";
16 
17 BOOST_AUTO_TEST_SUITE(PolySIMTest)
18 
19 double pi( const Sequence::SimData & d )
20 {
21  unsigned ndiffs=0,ncomps=0;
22 
23  for( unsigned i = 0 ; i < d.size() - 1 ; ++i )
24  {
25  for( unsigned j = i + 1 ; j < d.size() ; ++j )
26  {
27  ++ncomps;
28  for( unsigned k = 0 ; k < d[i].size() ; ++k )
29  {
30  ndiffs += (d[i][k] != d[j][k]) ? 1 : 0;
31  }
32  }
33  }
34  return double(ndiffs)/double(ncomps);
35 }
36 
37 unsigned nsingletons( const Sequence::SimData & d )
38 {
39  if(d.empty())return 0;
40  unsigned nsing = 0;
41  std::for_each(d.sbegin(),d.send(),
42  [&nsing,&d](const Sequence::polymorphicSite & __ps) {
43  auto c = std::count(__ps.second.begin(),__ps.second.end(),'1');
44  nsing += ( unsigned(c) == 1 || unsigned(c) == d.size() - 1 ) ? 1 : 0;
45  });
46  return nsing;
47 }
48 
49 unsigned nexternal( const Sequence::SimData & d )
50 {
51  if(d.empty())return 0;
52  unsigned next = 0;
53  std::for_each(d.sbegin(),d.send(),
54  [&next,&d](const Sequence::polymorphicSite & __ps) {
55  //In "Fu-ese", an external mutation is a derived singleton
56  next += ( std::count(__ps.second.begin(),__ps.second.end(),'1')==1 ) ? 1 : 0;
57  });
58  return next;
59 }
60 
61 //What do we do for no data?
62 BOOST_AUTO_TEST_CASE( check_empty_table )
63 {
65  Sequence::PolySIM ad(&d);
66 
67  //Estimators of theta = 4Neu, numbers of various types of polymorhpisms
68  BOOST_CHECK_EQUAL( ad.ThetaPi(), 0. );
69  BOOST_CHECK_EQUAL( ad.ThetaH(), 0. );
70  BOOST_CHECK_EQUAL( ad.ThetaL(), 0. );
71  BOOST_CHECK_EQUAL( ad.NumPoly(), 0 );
72  BOOST_CHECK_EQUAL( ad.NumMutations(), 0 );
73  BOOST_CHECK_EQUAL( ad.NumSingletons(), 0 );
74  BOOST_CHECK_EQUAL( ad.NumExternalMutations(), 0 );
75 
76  //Variance on theta estimators that no one in their right mind should use
77  //other than for teaching
78  BOOST_CHECK( std::isnan(ad.VarThetaW()) );
79  BOOST_CHECK( std::isnan(ad.VarPi()) );
80  BOOST_CHECK( std::isnan(ad.SamplingVarPi()) );
81  BOOST_CHECK( std::isnan(ad.StochasticVarPi()) );
82 
83  //Haplotype-related stats
84  /*
85  This may seem confusing. But, and empty PolyTable means no segregating sites.
86  But, PolyTables come after filtering. Further, a SimData is
87  assumed to be based on a sample of size n > 0, thus there is a minimum of 1
88  haplotype
89  */
90  BOOST_CHECK_EQUAL( ad.DandVK(), 1 ); // No. unique haps
91  BOOST_CHECK_EQUAL( ad.DandVH(), 0. ); // Haplotype diversity
92  BOOST_CHECK( std::isnan(ad.WallsB()) );
93  BOOST_CHECK( std::isnan(ad.WallsQ()) );
94  BOOST_CHECK_EQUAL( ad.WallsBprime(), 0. );
95  BOOST_CHECK_EQUAL( ad.Minrec(), std::numeric_limits<unsigned>::max() );
96 
97  //Classic summaries of the site-frequency spectrum
98  BOOST_CHECK( std::isnan(ad.TajimasD()) );
99  BOOST_CHECK( std::isnan(ad.Dnominator()) );
100  BOOST_CHECK( std::isnan(ad.FuLiD()) );
101  BOOST_CHECK( std::isnan(ad.FuLiF()) );
102  BOOST_CHECK( std::isnan(ad.FuLiDStar()) );
103  BOOST_CHECK( std::isnan(ad.FuLiFStar()) );
104  BOOST_CHECK( std::isnan(ad.Hprime()) );
105 
106  //Other stats
107  BOOST_CHECK( std::isnan(ad.HudsonsC()) );
108  BOOST_CHECK( ad.Disequilibrium().empty() );
109 }
110 
111 BOOST_AUTO_TEST_CASE( pi1 )
112 {
114  std::ifstream in(testfile);
115  BOOST_REQUIRE_NO_THROW(in >> d >> std::ws);
116  in.close();
117 
118  Sequence::PolySIM ad(&d);
119 
120  BOOST_REQUIRE_CLOSE( ad.ThetaPi(), pi(d), 0.001 );
121  BOOST_REQUIRE_EQUAL( ad.NumPoly(), d.numsites() ); //true by definition of the ms format
122 }
123 
124 BOOST_AUTO_TEST_CASE( S1 )
125 {
127  std::ifstream in(testfile);
128  BOOST_REQUIRE_NO_THROW(in >> d >> std::ws);
129  in.close();
130 
131  Sequence::PolySIM ad(&d);
132 
133  BOOST_REQUIRE_EQUAL( ad.NumPoly(), d.numsites() ); //true by definition of the ms format
134 }
135 
136 BOOST_AUTO_TEST_CASE( singletons1 )
137 {
139  std::ifstream in(testfile);
140  BOOST_REQUIRE_NO_THROW(in >> d >> std::ws);
141  in.close();
142 
143  Sequence::PolySIM ad(&d);
144 
145  BOOST_REQUIRE_EQUAL( ad.NumSingletons(), nsingletons(d) ); //true by definition of the ms format
146 }
147 
148 BOOST_AUTO_TEST_CASE( external1 )
149 {
151  std::ifstream in(testfile);
152  BOOST_REQUIRE_NO_THROW(in >> d >> std::ws);
153  in.close();
154 
155  Sequence::PolySIM ad(&d);
156 
157  BOOST_REQUIRE_EQUAL( ad.NumExternalMutations(), nexternal(d) ); //true by definition of the ms format
158 }
159 
160 //number of unique haps
161 BOOST_AUTO_TEST_CASE( nhaps1 )
162 {
164  std::ifstream in(testfile);
165  BOOST_REQUIRE_NO_THROW(in >> d >> std::ws);
166  in.close();
167 
168  Sequence::PolySIM ad(&d);
169  std::set<std::string> uhaps(d.begin(),d.end());
170  BOOST_REQUIRE_EQUAL( ad.DandVK(), uhaps.size() );
171 }
172 
173 //hap diversity
174 BOOST_AUTO_TEST_CASE( hapdiv )
175 {
177  std::ifstream in(testfile);
178  BOOST_REQUIRE_NO_THROW(in >> d >> std::ws);
179  in.close();
180 
181  Sequence::PolySIM ad(&d);
182  std::set<std::string> uhaps(d.begin(),d.end());
183  double hdiv = 0.; //Explicity hap. het
184  double hhom = 0.; //Explicity hap. homozygosity.
185  for( auto h : uhaps )
186  {
187  auto c = std::count( d.begin(),d.end(), h );
188  hdiv += ( double(c)/double(d.size()) )*(double(d.size()-c)/double(d.size()-1));
189  hhom += ( double(c)/double(d.size()) )*(double(c-1)/double(d.size()-1));
190  }
191  double hdiv2 = 1. - hhom;
192  double lseqvalue = ad.DandVH();
193  BOOST_REQUIRE_CLOSE( lseqvalue, hdiv, 1e-3);
194  BOOST_REQUIRE_CLOSE( lseqvalue, hdiv2, 1e-3);
195 }
196 BOOST_AUTO_TEST_SUITE_END()
declaration of Sequence::PolySIM, a class to analyze coalescent simulation data
Declaration of Sequence::SimData, a class representing polymorphism data from coalescent simulations ...
Data from coalescent simulations.
Analysis of coalescent simulation data.
std::pair< double, std::string > polymorphicSite