libsequence  1.9.5
nSL.cc
1 #include <Sequence/SimData.hpp>
2 #include <Sequence/SummStats/nSL.hpp>
3 #include <algorithm>
4 #include <numeric>
5 #include <array>
6 #include <cmath>
7 #include <limits>
8 #include <unordered_map>
9 // For parallelizing nSL:
10 #include <functional>
11 #include <iostream>
12 
13 using namespace std;
14 
15 namespace
16 {
17  inline double
18  maxabs(double score, double mean, double sd, double rv)
19  {
20  if (isfinite(score))
21  {
22  double zscore = (score - mean) / sd;
23  if (!isfinite(rv) || fabs(zscore) > fabs(rv))
24  return zscore;
25  }
26  return rv;
27  }
28 
29  double
30  update_s2(std::string::const_iterator start,
31  std::string::const_iterator left,
32  std::string::const_iterator right, const Sequence::SimData &d,
33  const std::unordered_map<double, double> &gmap)
34  {
35  auto p1 = d.position(
36  std::vector<double>::size_type(distance(start, right)) - 1);
37  auto p2 = d.position(
38  std::vector<double>::size_type(distance(start, left)));
39  if (gmap.empty())
40  {
41  // return phyisical distance
42  return fabs(p1 - p2);
43  }
44  // return distance along genetic map,
45  // in whatever units those are.
46  auto fp1 = gmap.find(p1);
47  auto fp2 = gmap.find(p2);
48  if (fp1 == gmap.end() || fp2 == gmap.end())
49  {
50  throw std::runtime_error(
51  "position could not be found in genetic map, "
52  + std::string(__FILE__) + " line "
53  + std::to_string(__LINE__));
54  }
55  return fabs(fp1->second - fp2->second);
56  }
57  /*
58  Mechanics of the nSL statistic
59 
60  RV = nSL,iHS, as defined in doi:10.1093/molbev/msu077
61  */
62  std::array<double, 4>
63  __nSLdetails(const std::size_t &core, const Sequence::SimData &d,
64  // const vector<size_t> &coretype,
65  const std::unordered_map<double, double> &gmap)
66  {
67  auto csize = d.size();
68  // This tracks s,s2 for ancestral and derived
69  // mutation, resp:
70  std::array<double, 4> rv = { 0., 0., 0., 0. };
71  // number of comparisons for ancestral and
72  // derived, resp:
73  std::array<unsigned, 2> nc = { 0u, 0u };
74  for (size_t i = 0; i < csize; ++i)
75  {
76  auto start = d[i].cbegin();
77  auto bi
78  = start
79  + static_cast<decltype(start)::difference_type>(core);
80  size_t iIsDer = (*bi == '1');
81  for (size_t j = i + 1; j < csize; ++j)
82  {
83  auto bj
84  = d[j].cbegin()
85  + static_cast<decltype(start)::difference_type>(
86  core);
87  size_t jIsDer = (*bj == '1');
88  if (iIsDer == jIsDer)
89  {
90  auto eri = d[i].crend();
91  auto ei = d[i].cend();
92  auto right = mismatch(bi, ei, bj);
93  string::const_reverse_iterator ri1(bi),
94  ri2(bj);
95  auto left = mismatch(ri1, eri, ri2);
96  if (left.first != eri && right.first != ei)
97  {
98  rv[2 * iIsDer] += static_cast<double>(
99  distance(left.first.base(),
100  right.first)
101  + 1);
102  rv[2 * iIsDer + 1] += update_s2(
103  start, left.first.base(),
104  right.first, d, gmap);
105  nc[iIsDer]++;
106  }
107  }
108  }
109  }
110  rv[0] /= static_cast<double>(nc[0]);
111  rv[1] /= static_cast<double>(nc[0]);
112  rv[2] /= static_cast<double>(nc[1]);
113  rv[3] /= static_cast<double>(nc[1]);
114  return rv;
115  }
116 }
117 
118 namespace Sequence
119 {
120  /*
121  The nSL statistic of doi:10.1093/molbev/msu077
122  */
123  pair<double, double>
124  nSL(const std::size_t &core, const SimData &d,
125  const std::unordered_map<double, double> &gmap)
126  {
127  auto nsl = __nSLdetails(core, d, gmap);
128  return make_pair(log(nsl[0]) - log(nsl[2]), log(nsl[1]) - log(nsl[3]));
129  }
130 
131  // double
132  // update_return_value(vector<double> &binned_scores, const double
133  // current_rv)
135  //{
136  // if (binned_scores.size() > 1)
137  // {
138  // double sum = accumulate(binned_scores.begin(),
139  // binned_scores.end(), 0.);
140  // double sumsq = accumulate(
141  // binned_scores.begin(), binned_scores.end(), 0.,
142  // [](double ss, double val) { return ss + pow(val, 2.0);
143  // });
144  // double mean = sum /
145  // static_cast<double>(binned_scores.size());
146  // double sd = sqrt(sumsq);
147  // transform(binned_scores.begin(), binned_scores.end(),
148  // binned_scores.begin(), [mean, sd](double score) {
149  // return (score - mean) / sd;
150  // });
151  // auto me = max_element(
152  // binned_scores.begin(), binned_scores.end(),
153  // [](double a, double b) { return fabs(a) < fabs(b); });
154  // if (!isfinite(current_rv) || fabs(*me) > fabs(current_rv))
155  // {
156  // return fabs(*me);
157  // }
158  // }
159  // return current_rv;
160  //}
161  /*
162  Return max. abs value of standardized nSL and iHS, with the latter as
163  defined by Ferrer-Admetella et al.
164  */
165  // pair<double, double>
166  // snSL(const SimData &d, const double minfreq, const double binsize,
167  // bool filter_minor, const std::unordered_map<double, double> &gmap)
168  //{
169  // if (d.empty())
170  // return make_pair(std::numeric_limits<double>::quiet_NaN(),
171  // std::numeric_limits<double>::quiet_NaN());
172 
173  // vector<unsigned> dcounts;
174  // dcounts.reserve(d.numsites());
175  // vector<size_t> core_snps;
176 
177  // for (auto p = d.sbegin(); p != d.send(); ++p)
178  // {
179  // unsigned dcount = static_cast<unsigned>(
180  // count(p->second.begin(), p->second.end(), '1'));
181  // if (dcount && dcount < d.size())
182  // {
183  // double f = static_cast<double>(dcount)
184  // / static_cast<double>(d.size());
185  // if (filter_minor)
186  // {
187  // f = min(f, 1. - f);
188  // dcount = min(dcount,
189  // static_cast<unsigned>(d.size())
190  // - dcount);
191  // }
192  // if (f >= minfreq)
193  // {
194  // core_snps.push_back(
195  // static_cast<size_t>(p - d.sbegin()));
196  // dcounts.push_back(dcount);
197  // }
198  // }
199  // }
200  // if (core_snps.empty())
201  // return make_pair(std::numeric_limits<double>::quiet_NaN(),
202  // std::numeric_limits<double>::quiet_NaN());
203  // // Get the stats
204  // auto nSLstats = nSL_t(d, core_snps, gmap);
205  // const std::size_t nbins = static_cast<size_t>(
206  // std::ceil(((filter_minor) ? 0.5 : 1.0) / binsize));
207  // vector<vector<double>> binned_scores_nSL(nbins),
208  // binned_scores_iHS(nbins);
209  // double binsize_counts = (binsize * double(d.size()));
210  // for (size_t i = 0; i < core_snps.size(); ++i)
211  // {
212  // size_t bin
213  // = size_t(static_cast<double>(dcounts[i]) /
214  // binsize_counts);
215  // if (isfinite(get<0>(nSLstats[i])))
216  // {
217  // binned_scores_nSL[bin].push_back(get<0>(nSLstats[i]));
218  // }
219  // if (isfinite(get<1>(nSLstats[i])))
220  // {
221  // binned_scores_iHS[bin].push_back(get<1>(nSLstats[i]));
222  // }
223  // }
224  // double rv_nSL = numeric_limits<double>::quiet_NaN(),
225  // rv_iHS = numeric_limits<double>::quiet_NaN();
226  // for (size_t i = 0; i < binned_scores_nSL.size(); ++i)
227  // {
228  // rv_nSL = update_return_value(binned_scores_nSL[i], rv_nSL);
229  // rv_iHS = update_return_value(binned_scores_iHS[i], rv_iHS);
230  // }
231  // return make_pair(rv_nSL, rv_iHS);
232  //}
233 }
std::pair< double, double > nSL(const std::size_t &core, const SimData &d, const std::unordered_map< double, double > &gmap=std::unordered_map< double, double >()) __attribute__((deprecated))
Definition: nSL.cc:124
STL namespace.
The namespace in which this library resides.
double mean(iterator beg, iterator end)
nSLiHS nsl(const VariantMatrix &m, const std::size_t core, const std::int8_t refstate)
nSL and iHS statistics
Definition: nsl.cc:76
Declaration of Sequence::SimData, a class representing polymorphism data from coalescent simulations ...
Data from coalescent simulations.