libsequence  1.9.5
haplotype_statistics.cc
1 #include <algorithm>
2 #include <vector>
3 #include <cstdint>
4 #include <unordered_map>
7 #include <Sequence/VariantMatrix.hpp>
8 #include <Sequence/VariantMatrixViews.hpp>
9 
10 namespace Sequence
11 {
12  //TODO: modify to ignore sequences
13  //with missing data above a certain threshold?
14  std::vector<std::int32_t>
16  {
17  std::vector<std::int32_t> rv;
18  rv.reserve(m.nsam);
19  for (std::size_t i = 0; i < m.nsam - 1; ++i)
20  {
21  auto ci = get_ConstColView(m, i);
22  for (std::size_t j = i + 1; j < m.nsam; ++j)
23  {
24  auto cj = get_ConstColView(m, j);
25  std::int32_t ndiffs = 0;
26  auto cib = ci.begin(), cjb = cj.begin();
27  //If both sites are not missing
28  //and not equal, they are different
29  while (cib != ci.end())
30  {
31  if (*cib >= 0 && *cjb >= 0 && *cib != *cjb)
32  {
33  ++ndiffs;
34  }
35  ++cib;
36  ++cjb;
37  }
38  rv.push_back(ndiffs);
39  }
40  }
41  return rv;
42  }
43 
44  std::vector<std::int32_t>
46  {
47  std::vector<std::int32_t> rv;
48  rv.reserve(m.nsam);
49  for (std::size_t i = 0; i < m.nsam - 1; ++i)
50  {
51  auto ci = get_ConstColView(m, i);
52  for (std::size_t j = i + 1; j < m.nsam; ++j)
53  {
54  auto cj = get_ConstColView(m, j);
55  auto cib = ci.begin(), cjb = cj.begin();
56  std::int32_t different = 0;
57  //If both sites are not missing
58  //and not equal, they are different
59  while (cib != ci.end() && !different)
60  {
61  if (*cib >= 0 && *cjb >= 0 && *cib != *cjb)
62  {
63  different = 1;
64  }
65  ++cib;
66  ++cjb;
67  }
68  rv.push_back(different);
69  }
70  }
71  return rv;
72  }
73 
74  std::vector<std::int32_t>
76  {
77  std::vector<std::int32_t> rv(m.nsam, -1);
78  if (rv.empty())
79  {
80  return rv;
81  }
82  rv.reserve(m.nsam);
83  const auto dm = is_different_matrix(m);
84  auto dmi = dm.cbegin();
85  int next_label = 0;
86  // We got all the way to nsam for the
87  // case where the last haplotype is unique.
88  // In 1.9.4, we went to nsam-1, which was wrong.
89  // Fixed in 1.9.5
90  for (std::size_t i = 0; i < m.nsam; ++i)
91  {
92  if (rv[i] < 0)
93  {
94  if (!all_missing(get_ConstColView(m, i)))
95  {
96  rv[i] = next_label;
97  for (std::size_t j = i + 1; j < m.nsam;
98  ++j, ++dmi)
99  {
100  if (!all_missing(
101  get_ConstColView(m, j)))
102  {
103  if (*dmi == 0)
104  {
105  rv[j] = next_label;
106  }
107  }
108  }
109  ++next_label;
110  }
111  }
112  }
113  return rv;
114  }
115 
116  std::int32_t
118  {
119  if (m.data.empty() || !m.nsam)
120  {
121  return -1;
122  }
123  auto labels = label_haplotypes(m);
124  labels.erase(
125  std::remove_if(labels.begin(), labels.end(),
126  [](decltype(labels[0]) x) { return x < 0; }),
127  labels.end());
128  std::sort(labels.begin(), labels.end());
129  auto u = std::unique(labels.begin(), labels.end());
130  return static_cast<std::int32_t>(std::distance(labels.begin(), u));
131  }
132 
133  double
135  {
136  if (m.data.empty() || !m.nsam)
137  {
138  return std::numeric_limits<double>::quiet_NaN();
139  }
140  auto labels = label_haplotypes(m);
141  auto nmissing
142  = std::count_if(labels.begin(), labels.end(),
143  [](decltype(labels[0]) x) { return x < 0; });
144  auto nsam_adjusted = m.nsam - static_cast<decltype(m.nsam)>(nmissing);
145  labels.erase(
146  std::remove_if(labels.begin(), labels.end(),
147  [](decltype(labels[0]) x) { return x < 0; }),
148  labels.end());
149  std::unordered_map<std::int32_t, std::int32_t> label_counts;
150  for (auto l : labels)
151  {
152  label_counts[l]++;
153  }
154  return diversity_from_counts(label_counts, nsam_adjusted);
155  }
156 
157 } // namespace Sequence
double diversity_from_counts(const std::unordered_map< std::int32_t, std::int32_t > &counts, const std::size_t nsam)
Calculate heterozygosity/diversity from count data.
Definition: generic.cc:10
std::int32_t number_of_haplotypes(const VariantMatrix &m)
Calculate the number of haplotypes in a sample.
bool all_missing(const T &t)
Definition: util.hpp:18
The namespace in which this library resides.
std::vector< std::int32_t > label_haplotypes(const VariantMatrix &m)
Assign a unique label to each haplotype.
Generic utilities for calculating summary statistics.
Helper functions for implementing summary statistics.
Matrix representation of variation data.
std::vector< std::int32_t > is_different_matrix(const VariantMatrix &m)
std::size_t nsam
Sample size of data set.
ConstColView get_ConstColView(VariantMatrix &m, const std::size_t col)
Return a ConstColView from VariantMatrix m at col col. std::out_of_range is thcoln if col is out of r...
std::vector< std::int8_t > data
Data stored in matrix form with rows as sites.
double haplotype_diversity(const VariantMatrix &m)
Calculate the haplotype diversity of a sample.
std::vector< std::int32_t > difference_matrix(const VariantMatrix &m)
Calculate number of differences between all samples.