libsequence  1.9.5
FST.cc
1 /*
2 
3  Copyright (C) 2003-2009 Kevin Thornton, krthornt[]@[]uci.edu
4 
5  Remove the brackets to email me.
6 
7  This file is part of libsequence.
8 
9  libsequence is free software: you can redistribute it and/or modify
10  it under the terms of the GNU General Public License as published by
11  the Free Software Foundation, either version 3 of the License, or
12  (at your option) any later version.
13 
14  libsequence is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  GNU General Public License for more details.
18 
19  You should have received a copy of the GNU General Public License
20  long with libsequence. If not, see <http://www.gnu.org/licenses/>.
21 
22 */
23 
24 #include <Sequence/FST.hpp>
25 #include <Sequence/PolyTable.hpp>
28 #include <algorithm>
29 #include <numeric>
30 #include <vector>
31 #include <cmath>
32 #include <limits>
33 #include <stdexcept>
40 using std::accumulate;
41 using std::vector;
42 using std::pair;
43 
44 namespace Sequence
45 {
46 #ifndef DOXYGEN_SKIP //doxygen should skip this
47 
48  struct FSTimpl
49  {
50  typedef std::vector<Sequence::stateCounter> _vCounts;
51  typedef std::vector< _vCounts > _vvCounts;
52  _vvCounts _Counts;
53  unsigned _nsam, _nsites, _npop;
54  double _piB, _piT, _piS, _piD;
55  vector<unsigned> _config;
56  vector<double> _weights;
57  vector< pair<unsigned,unsigned> > _indexes;
58  bool _calcsDone;
59  polySiteVector pv;
60  typedef std::pair< std::set<char>, std::set<char> > PopStateSets;
61  PopStateSets getPopStateSets(const unsigned &i,
62  const unsigned &j,
63  const unsigned & site);
64  FSTimpl(const PolyTable *data, unsigned npop, const unsigned *config,
65  const double *weights, bool haveOutgroup, unsigned outgroup);
66  void doCalcs(void);
67  };
68 
69  FSTimpl::FSTimpl(const PolyTable *data, unsigned npop, const unsigned *config,
70  const double *weights, bool haveOutgroup, unsigned outgroup) :
71  _Counts( _vvCounts(npop,_vCounts(data->numsites()))),
72  _nsam(unsigned(data->size())),
73  _nsites(data->numsites()),_npop(npop),
74  _piB(0.), _piT(0.), _piS(0.), _piD(0.),
75  _indexes( vector< pair<unsigned,unsigned> >(npop) ),
76  _calcsDone(false),
77  pv( make_polySiteVector(*data) )
78  {
79  if (config == NULL)
80  {
81  throw std::runtime_error("Seqence::FST -- config vector is NULL");
82  }
83  else
84  {
85  _config.assign(config,config+npop);
86  if(accumulate(_config.begin(),_config.end(),
87  unsigned(0))+unsigned(haveOutgroup) != _nsam)
88  {
89  throw std::runtime_error("Seqence::FST -- sum of elements in config does not equal the sample size stored in data");
90  }
91  }
92  if (weights == NULL)
93  //equally weight all populations
94  {
95  _weights.assign(_npop, 1./double(_npop));
96  }
97  else
98  {
99  _weights.assign(weights,weights+npop);
100  double sum = accumulate(_weights.begin(),_weights.end(),0.);
101  if ( std::fabs(sum-1.) > std::numeric_limits<double>::epsilon() )
102  {
103  throw std::runtime_error("Seqence::FST -- weights do not sum to 1");
104  }
105  }
106 
107  _indexes[0] = pair<unsigned,unsigned>(0,_config[0]);
108  for(unsigned i = 1 ; i < _npop ; ++i)
109  {
110  _indexes[i] = pair<unsigned,unsigned>(_indexes[i-1].second,
111  _indexes[i-1].second+_config[i]);
112  }
113  for (unsigned i = 0 ; i < _npop ; ++i) //over pops
114  {
115  for(unsigned j = 0 ; j < _nsites ; ++j)//over sites
116  {
117  _Counts[i][j] = stateCounter('-');
118  for(unsigned k = _indexes[i].first ; k < _indexes[i].second ; ++k)//over sequences
119  {
120  if (haveOutgroup == false ||
121  (haveOutgroup == true && k != outgroup))
122  _Counts[i][j]((*data)[k][j]);
123  }
124  }
125  }
126 
127  if (haveOutgroup == true)
128  {
129  //remove outgroup state from pv
130  for(unsigned i=0;i<pv.size();++i)
131  {
132  pv[i].second.erase(outgroup,1);
133  }
134  }
135  this->doCalcs();
136  }
137 
138  FSTimpl::PopStateSets FSTimpl::getPopStateSets(const unsigned &pop1,
139  const unsigned &pop2,
140  const unsigned & site)
141  {
142  typedef std::string::difference_type UTYPE;
143  UTYPE beg1 = std::accumulate(_config.begin(),
144  _config.begin()+pop1,
145  0u);
146  UTYPE size1 = *(_config.begin()+pop1);
147  UTYPE beg2 = std::accumulate(_config.begin(),
148  _config.begin()+pop2,
149  0u);
150  UTYPE size2 = *(_config.begin()+pop2);
151 
152  //we need some rigamarole here to skip missing data
153  vector<char> ch1(pv[site].second.begin()+beg1,
154  pv[site].second.begin()+beg1+size1),
155  ch2(pv[site].second.begin()+beg2,
156  pv[site].second.begin()+beg2+size2);
157  ch1.erase( std::remove(ch1.begin(),ch1.end(),'N'),ch1.end() );
158  ch2.erase( std::remove(ch2.begin(),ch2.end(),'N'),ch2.end() );
159  std::set<char> states1(ch1.begin(),ch1.end()),
160  states2(ch2.begin(),ch2.end());
161  return std::make_pair(states1,states2);
162  }
163 #endif
164  FST::FST(const PolyTable *data, unsigned npop, const unsigned *config,
165  const double *weights, bool haveOutgroup, unsigned outgroup)
166 
167 
168 
184  {
185  try
186  {
187  impl = std::unique_ptr<FSTimpl>(new FSTimpl(data,npop,config,weights,haveOutgroup,outgroup));
188  }
189  catch(std::runtime_error &e)
190  {
191  throw;
192  }
193  catch (...)
194  {
195  throw (std::runtime_error("Sequence::FST : unknown exception caught by constructor"));
196  }
197  }
198 
199  FST::~FST(void)
200  {
201  //delete impl;
202  }
203 
204  void FSTimpl::doCalcs(void)
205  {
206  //calculate sums of w/in population weights and heterozygosity
207  double w_ii_sq = 0., weighted_Pi_ii = 0.;
208 
209  for (unsigned i = 0 ; i < _npop ; ++i) //over pops
210  {
211  double Pi = 0.;
212  w_ii_sq += _weights[i]*_weights[i];
213  for(unsigned j = 0 ; j < _nsites ; ++j)//over sites
214  {
215  unsigned n = _config[i];
216  double SSH = 0.;
217  n -= _Counts[i][j].n;
218  double denom = (double(n)* (double(n) - 1.0));
219  SSH += (_Counts[i][j].a > 0) ? double(_Counts[i][j].a) *
220  double (_Counts[i][j].a-1) /denom : 0. ;
221  SSH += (_Counts[i][j].g > 0) ? double(_Counts[i][j].g) *
222  double (_Counts[i][j].g-1) /denom : 0. ;
223  SSH += (_Counts[i][j].c > 0) ? double(_Counts[i][j].c) *
224  double (_Counts[i][j].c-1) /denom : 0. ;
225  SSH += (_Counts[i][j].t > 0) ? double(_Counts[i][j].t) *
226  double (_Counts[i][j].t-1) /denom : 0. ;
227  SSH += (_Counts[i][j].zero > 0) ? double(_Counts[i][j].zero) *
228  double (_Counts[i][j].zero-1) /denom : 0. ;
229  SSH += (_Counts[i][j].one > 0) ? double(_Counts[i][j].one) *
230  double (_Counts[i][j].one-1) /denom : 0. ;
231  Pi += (1.0 - SSH);
232  }//sites
233  weighted_Pi_ii += _weights[i]*_weights[i]*Pi;
234  }//pops
235 
236  //now calculate between-population divergence,
237  //using stored state info in _Counts
238  double sum_wi_wj = 0., weighted_Pi_ij = 0.;
239  for(unsigned i = 0 ; i < _npop - 1 ; ++i)//over pops_i
240  {
241  for(unsigned j = i+1 ; j < _npop ; ++j)//over pops_j
242  {
243  double Pi_i_j = 0.;
244  sum_wi_wj += _weights[i]*_weights[j];
245  for(unsigned k = 0 ; k < _nsites ; ++k)//over sites
246  {
247  unsigned ni = _config[i],nj = _config[j];
248  ni -= _Counts[i][k].n;
249  nj -= _Counts[j][k].n;
250  Pi_i_j += (_Counts[i][k].a > 0) ?
251  (double(_Counts[i][k].a)/double(ni))*
252  (double(nj-_Counts[j][k].a)/double(nj)):0.;
253  Pi_i_j += (_Counts[i][k].g > 0) ?
254  (double(_Counts[i][k].g)/double(ni))*
255  (double(nj-_Counts[j][k].g)/double(nj)):0.;
256  Pi_i_j += (_Counts[i][k].c > 0) ?
257  (double(_Counts[i][k].c)/double(ni))*
258  (double(nj-_Counts[j][k].c)/double(nj)):0.;
259  Pi_i_j += (_Counts[i][k].t > 0) ?
260  (double(_Counts[i][k].t)/double(ni))*
261  (double(nj-_Counts[j][k].t)/double(nj)):0.;
262  Pi_i_j += (_Counts[i][k].zero > 0) ?
263  (double(_Counts[i][k].zero)/double(ni))*
264  (double(nj-_Counts[j][k].zero)/double(nj)):0.;
265  Pi_i_j += (_Counts[i][k].one > 0) ?
266  (double(_Counts[i][k].one)/double(ni))*
267  (double(nj-_Counts[j][k].one)/double(nj)):0.;
268  }//sites
269  weighted_Pi_ij += _weights[i]*_weights[j]*Pi_i_j;
270  }//pops_j
271  }//pops_i
272 
273  //calculate measures of diversity/divergence needed to obtain Fst
274  _piT = weighted_Pi_ii + 2.*weighted_Pi_ij;
275  _piS = weighted_Pi_ii / w_ii_sq ;
276  _piB = weighted_Pi_ij / sum_wi_wj;
277  _piD = (_piT - _piS)/(2. * sum_wi_wj);
278 
279  _calcsDone = true;
280  }
281 
282  std::set<double> FST::shared(unsigned pop1, unsigned pop2) const
289  {
290  if (pop1 > impl->_npop-1 || pop2 > impl->_npop-1)
291  throw std::out_of_range("Seqence::FST -- indexes out of range");
292 
293  std::set<double> sharedList;
294 
295  for(unsigned site = 0 ; site < impl->_nsites ; ++site)
296  {
297  if (impl->_Counts[pop1][site].gap == 0 &&
298  impl->_Counts[pop2][site].gap == 0)
299  {
300  FSTimpl::PopStateSets states = impl->getPopStateSets(pop1,pop2,site);
301  if (states.first.size() > 1 && states.second.size() > 1)
302  {
303  vector<char> overlap(states.first.size()+states.second.size());
304  vector<char>::iterator itr = std::set_intersection(states.first.begin(),
305  states.first.end(),
306  states.second.begin(),
307  states.second.end(),
308  overlap.begin());
309  if ( itr - overlap.begin() > 0) //shared states exist
310  {
311  sharedList.insert(impl->pv[site].first);
312  }
313  }
314  }
315  }
316  return sharedList;
317  }
318 
319  std::set<double> FST::fixed(unsigned pop1, unsigned pop2) const
326  {
327  if (pop1 > impl->_npop-1 || pop2 > impl->_npop-1)
328  throw std::out_of_range("Seqence::FST -- indexes out of range");
329  std::set<double> fixedList;
330  for(unsigned site = 0 ; site < impl->_nsites ; ++site)
331  {
332  if (impl->_Counts[pop1][site].gap == 0 &&
333  impl->_Counts[pop2][site].gap == 0)
334  {
335  FSTimpl::PopStateSets states = impl->getPopStateSets(pop1,pop2,site);
336  vector<char> overlap(states.first.size()+states.second.size());
337  vector<char>::iterator itr = std::set_intersection(states.first.begin(),
338  states.first.end(),
339  states.second.begin(),
340  states.second.end(),
341  overlap.begin());
342  if ( itr - overlap.begin() == 0) //no shared states exist
343  {
344  fixedList.insert(impl->pv[site].first);
345  }
346  }
347  }
348  return fixedList;
349  }
350 
351  std::pair< std::set<double>,std::set<double> >
352  FST::Private(unsigned pop1, unsigned pop2) const
362  {
363  if (pop1 > impl->_npop-1 || pop2 > impl->_npop-1)
364  throw std::out_of_range("Seqence::FST -- indexes out of range");
365 
366  std::set<double> p1,p2;
367  for(unsigned site = 0 ; site < impl->_nsites ; ++site)
368  {
369  if (impl->_Counts[pop1][site].gap == 0 &&
370  impl->_Counts[pop2][site].gap == 0)
371  {
372  FSTimpl::PopStateSets states = impl->getPopStateSets(pop1,pop2,site);
373 
374  //now, use std::set_difference to identify private polys
375  vector<char> priv1(states.first.size()),priv2(states.second.size());
376  //private in pop1
377  vector<char>::iterator itr1 = std::set_difference(states.first.begin(),states.first.end(),
378  states.second.begin(),states.second.end(),
379  priv1.begin());
380  //private in pop2
381  vector<char>::iterator itr2 = std::set_difference(states.second.begin(),states.second.end(),
382  states.first.begin(),states.first.end(),
383  priv2.begin());
384  //if there are unique states && site is polymorphic,
385  //then there is a private poly
386  if ( (itr1-priv1.begin())>0 && states.first.size()>1)
387  {
388  p1.insert(impl->pv[site].first);
389  }
390  if ( (itr2-priv2.begin())>0 && states.second.size()>1)
391  {
392  p2.insert(impl->pv[site].first);
393  }
394  }
395  }
396  return std::make_pair(p1,p2);
397  }
398 
399  double FST::HSM(void) const
405  {
406  return impl->_piD/(impl->_piS+impl->_piD);
407  }
408 
409  double FST::Slatkin(void) const
415  {
416  return impl->_piD/(2.*impl->_piS + impl->_piD);
417  }
418 
419  double FST::HBK(void) const
426  {
427  return 1.-(impl->_piS/impl->_piT);
428  }
429 
430  double FST::piB(void) const
435  {
436  return impl->_piB;
437  }
438 
439  double FST::piT(void) const
444  {
445  return impl->_piT;
446  }
447 
448  double FST::piS(void) const
453  {
454  return impl->_piS;
455  }
456 
457  double FST::piD(void) const
464  {
465  return impl->_piD;
466  }
467 }
polySiteVector make_polySiteVector(const Sequence::PolyTable &data) __attribute__((deprecated))
delcaration of a class (Sequence::FST) to analyze population structure
Sequence::PolyTable, a virtual base class for polymorphism tables.
The namespace in which this library resides.
std::vector< polymorphicSite > polySiteVector
Site-major variation tables in ASCII format.