libsequence  1.9.5
PolyTableTweaking.cc
1 #include <Sequence/PolySites.hpp>
2 #include <Sequence/Fasta.hpp>
5 #include <boost/test/unit_test.hpp>
6 #include <cstdio>
7 #include <cstdlib>
8 #include <cctype>
9 #include <iostream>
10 #include <functional>
11 
12 using namespace Sequence;
13 //Removal of N
14 BOOST_AUTO_TEST_SUITE(PolyTableTweakingTest)
15 
16 BOOST_AUTO_TEST_CASE( remove_missing_N )
17 {
18  std::vector<double> pos = {1,2,3,4,5};
19  std::vector<std::string> data = {"AAAAA",
20  "AAGAA",
21  "CTGAA",
22  "NAACT"};
23 
24  Sequence::PolySites ps(std::move(pos),std::move(data)),
25  ps2(copyPolyTable(ps)),ps3(copyPolyTable(ps)),ps4(copyPolyTable(ps));
26 
27  BOOST_CHECK(pos.empty());
28  BOOST_CHECK(data.empty());
29 
30  BOOST_REQUIRE(ps == ps2);
31 
32  BOOST_REQUIRE_EQUAL( ps.numsites() , 5 );
33  BOOST_REQUIRE_EQUAL( ps.size(), 4 );
34 
35  ps = removeMissing(ps);
36 
37  BOOST_REQUIRE_EQUAL( ps.numsites() , 4 );
38  BOOST_REQUIRE_EQUAL( ps.size(), 4 );
39 
40  //Don't remove missing data from the outgroup
41  //outgroup is seq w/missing
42  ps2=removeMissing(ps2,true,3);
43 
44  BOOST_REQUIRE_EQUAL( ps2.numsites() , 5 );
45  BOOST_REQUIRE_EQUAL( ps2.size(), 4 );
46 
47  //Do remove missing data from the outgroup
48  //outgroup is seq w/missing data
49  ps2=removeMissing(ps2,false,3);
50 
51  BOOST_REQUIRE_EQUAL( ps2.numsites() , 4 );
52  BOOST_REQUIRE_EQUAL( ps2.size(), 4 );
53 
54  //Don't remove missing data based on outgroup
55  //outgroup does not have missing data
56  ps3=removeMissing(ps3,false,0);
57  BOOST_REQUIRE_EQUAL( ps3.numsites() , 4 );
58  BOOST_REQUIRE_EQUAL( ps3.size(), 4 );
59 
60  BOOST_REQUIRE( ps2 == ps3 );
61 
62  //Do remove missing data based on outgroup
63  //outgroup does not have missing data
64  ps4=removeMissing(ps4,false,0);
65  BOOST_REQUIRE_EQUAL( ps4.numsites() , 4 );
66  BOOST_REQUIRE_EQUAL( ps4.size(), 4 );
67 
68  BOOST_REQUIRE( ps3 == ps4 );
69 }
70 
71 //remove haplotypes with missing data
72 // BOOST_AUTO_TEST_CASE( remove_missing_Nhap )
73 // {
74 // std::vector<double> pos = {1,2,3,4,5};
75 // std::vector<std::string> data = {"AAAAA",
76 // "AAGAA",
77 // "CTGAA",
78 // "NAACT"};
79 
80 // Sequence::PolySites ps(std::move(pos),std::move(data));
81 // ps.second.erase( std::remove_if(ps.begin(),
82 // ps.end(),
83 // [](const std::string & __s) {
84 // return __s.find('N') != std::string::npos;
85 // }),
86 // ps.end() );
87 // BOOST_CHECK_EQUAL( ps.size(), 3 );
88 // }
89 
90 //What about lower-case data?
91 // BOOST_AUTO_TEST_CASE( remove_missing_n )
92 // {
93 // std::vector<double> pos = {1,2,3,4,5};
94 // std::vector<std::string> data = {"aaaaa",
95 // "AAGAA",
96 // "CTGAA",
97 // "NAACT"};
98 
99 // Sequence::PolySites ps(std::move(pos),std::move(data));
100 
101 // for( auto & d : ps )
102 // {
103 // std::transform(d.begin(),
104 // d.end(),
105 // d.begin(),
106 // [](char & ch) { return std::tolower(ch); });
107 // }
108 
109 // Sequence::PolySites ps2(ps),ps3(ps),ps4(ps);
110 
111 // BOOST_REQUIRE(ps == ps2);
112 
113 // BOOST_REQUIRE_EQUAL( ps.numsites() , 5 );
114 // BOOST_REQUIRE_EQUAL( ps.size(), 4 );
115 
116 // ps.RemoveMissing();
117 
118 // BOOST_REQUIRE_EQUAL( ps.numsites() , 4 );
119 // BOOST_REQUIRE_EQUAL( ps.size(), 4 );
120 
121 // //Don't remove missing data from the outgroup
122 // //outgroup is seq w/missing
123 // ps2.RemoveMissing(true,3);
124 
125 // BOOST_REQUIRE_EQUAL( ps2.numsites() , 5 );
126 // BOOST_REQUIRE_EQUAL( ps2.size(), 4 );
127 
128 // //Do remove missing data from the outgroup
129 // //outgroup is seq w/missing data
130 // ps2.RemoveMissing(false,3);
131 
132 // BOOST_REQUIRE_EQUAL( ps2.numsites() , 4 );
133 // BOOST_REQUIRE_EQUAL( ps2.size(), 4 );
134 
135 // //Don't remove missing data based on outgroup
136 // //outgroup does not have missing data
137 // ps3.RemoveMissing(false,0);
138 // BOOST_REQUIRE_EQUAL( ps3.numsites() , 4 );
139 // BOOST_REQUIRE_EQUAL( ps3.size(), 4 );
140 
141 // BOOST_REQUIRE( ps2 == ps3 );
142 
143 // //Do remove missing data based on outgroup
144 // //outgroup does not have missing data
145 // ps4.RemoveMissing(false,0);
146 // BOOST_REQUIRE_EQUAL( ps4.numsites() , 4 );
147 // BOOST_REQUIRE_EQUAL( ps4.size(), 4 );
148 
149 // BOOST_REQUIRE( ps3 == ps4 );
150 // }
151 
152 //Like "nextgen" data, right?
153 BOOST_AUTO_TEST_CASE( remove_missing_extreme )
154 {
155  std::vector<double> pos = {1,2,3,4,5};
156  std::vector<std::string> data = {"aaaaa",
157  "AAGAA",
158  "CTGAA",
159  "NAACT"};
160 
161  Sequence::PolySites ps(std::move(pos),std::move(data));
162 
163  //convert it all to missing
164  for( auto & d : ps )
165  {
166  std::for_each(d.begin(),
167  d.end(),
168  [](char & ch) { ch = 'N'; });
169  }
170 
171  ps=removeMissing(ps);
172  BOOST_REQUIRE(ps.empty());
173  BOOST_REQUIRE_EQUAL(ps.numsites(),0);
174  BOOST_REQUIRE_EQUAL(ps.size(),0);
175 }
176 
177 //operator== is case-sensitive!
178 BOOST_AUTO_TEST_CASE( to_lower )
179 {
180  std::vector<double> pos = {1,2,3,4,5};
181  std::vector<std::string> data = {"AAAAA",
182  "AAGAA",
183  "CTGAA",
184  "NAACT"};
185 
186  Sequence::PolySites ps(std::move(pos),std::move(data)),
187  ps2(copyPolyTable(ps));
188 
189  for( auto & d : ps )
190  {
191  std::transform(d.begin(),
192  d.end(),
193  d.begin(),
194  [](char & ch) { return std::tolower(ch); });
195  }
196 
197  //Now, ps and ps2 will not be equal
198  BOOST_REQUIRE( ps != ps2 );
199 
200  for( auto & d : ps )
201  {
202  std::transform(d.begin(),
203  d.end(),
204  d.begin(),
205  [](char & ch) { return std::toupper(ch); });
206  }
207 
208  BOOST_REQUIRE( ps == ps2 );
209 }
210 
211 BOOST_AUTO_TEST_CASE( to_lower2 )
212 {
213  std::vector<double> pos = {1,2,3,4,5};
214  std::vector<std::string> data = {"AAAAA",
215  "AAGAA",
216  "CTGAA",
217  "NAACT"};
218 
219  Sequence::PolySites ps(std::move(pos),std::move(data)),
220  ps2(copyPolyTable(ps));
221 
222  for( auto & d : ps )
223  {
224  for( auto & ch : d )
225  {
226  ch = std::tolower(ch);
227  }
228  }
229 
230  //Now, ps and ps2 will not be equal
231  BOOST_REQUIRE( ps != ps2 );
232 
233  for( auto & d : ps )
234  {
235  for( auto & ch : d )
236  {
237  ch = std::toupper(ch);
238  }
239  }
240 
241  BOOST_REQUIRE( ps == ps2 );
242 }
243 
244 //TODO: restore
245 
246 // BOOST_AUTO_TEST_CASE( remove_maf_with_outgroup )
247 // {
248 // std::vector<double> pos = {1,2,3,4,5};
249 // std::vector<std::string> data = {"AAAAA",
250 // "AAGAA",
251 // "AAGAA",
252 // "CAGAA",
253 // "CTGAA",
254 // "NAACT"};
255 
256 // Sequence::PolySites ps(std::move(pos),std::move(data));
257 
258 // //The outgroup is the first sequence
259 // ps.ApplyFreqFilter(2,true,0);
260 // BOOST_REQUIRE_EQUAL( ps.numsites(), 1 );
261 // BOOST_REQUIRE( !ps.empty() );
262 
263 // ps.ApplyFreqFilter(3,true,0);
264 
265 // BOOST_REQUIRE_EQUAL( ps.numsites(), 0 );
266 // BOOST_REQUIRE_EQUAL( ps.size(), 0 );
267 // BOOST_REQUIRE( ps.empty() );
268 // }
269 
270 // BOOST_AUTO_TEST_CASE( remove_maf )
271 // {
272 // std::vector<double> pos = {1,2,3,4,5};
273 // std::vector<std::string> data = {"AAAAA",
274 // "AAGAA",
275 // "CTGAA",
276 // "NAACT"};
277 
278 // Sequence::PolySites ps(std::move(pos),std::move(data));
279 
280 // ps.ApplyFreqFilter(2);
281 
282 // BOOST_REQUIRE_EQUAL( ps.numsites(), 1 );
283 // BOOST_REQUIRE( !ps.empty() );
284 
285 // ps.ApplyFreqFilter(3);
286 
287 // BOOST_REQUIRE_EQUAL( ps.numsites(), 0 );
288 // BOOST_REQUIRE_EQUAL( ps.size(), 0 );
289 // BOOST_REQUIRE( ps.empty() );
290 // }
291 
292 BOOST_AUTO_TEST_CASE( remove_multi )
293 {
294  std::vector<double> pos = {1,2,3,4,5};
295  std::vector<std::string> data = {"TAAAA",
296  "AAGAA",
297  "CTGAA",
298  "TAACT"};
299 
300  Sequence::PolySites ps(std::move(pos),std::move(data));
301 
302  ps=removeMultiHits(ps);
303 
304  BOOST_REQUIRE_EQUAL( ps.numsites(), 4 );
305 }
306 
307 /*
308  Now, we only consider the ingroup for removing multiple
309  hits. In this test, site 1 has 3 states, but we
310  will treat the first sequence as the outgroup. Thus,
311  the ingroup has only 2 states and thus the site
312  won't be filtered.
313 */
314 BOOST_AUTO_TEST_CASE( remove_multi_ingroup )
315 {
316  std::vector<double> pos = {1,2,3,4,5};
317  std::vector<std::string> data = {"CAAAA",
318  "AAGAA",
319  "ATGAA",
320  "TAACT"};
321 
322  Sequence::PolySites ps(std::move(pos),std::move(data));
323 
324  ps=removeMultiHits(ps,true,0);
325 
326  BOOST_REQUIRE_EQUAL( ps.numsites(), 5 );
327 }
328 
329 /*
330  In this case, we'll take the second sequence
331  as the outgroup, and now site 1 will
332  get filtered out
333 */
334 BOOST_AUTO_TEST_CASE( remove_multi_ingroup2 )
335 {
336  std::vector<double> pos = {1,2,3,4,5};
337  std::vector<std::string> data = {"CAAAA",
338  "AAGAA",
339  "ATGAA",
340  "TAACT"};
341 
342  Sequence::PolySites ps(std::move(pos),std::move(data));
343  ps=removeMultiHits(ps,true,1);
344  BOOST_REQUIRE_EQUAL( ps.numsites(), 4 );
345 }
346 
347 BOOST_AUTO_TEST_CASE( remove_ambig )
348 {
349  std::vector<double> pos = {1,2,3,4,5};
350  //Q is not a DNA character.
351  std::vector<std::string> data = {"QAAAA",
352  "AAGAA",
353  "ATGAA",
354  "TAACT"};
355  Sequence::PolySites ps(std::move(pos),std::move(data));
356 
357  ps=removeAmbiguous(ps);
358 
359  BOOST_REQUIRE_EQUAL( ps.numsites(), 4 );
360 }
361 
362 BOOST_AUTO_TEST_CASE( remove_ambig2 )
363 {
364  std::vector<double> pos = {1,2,3,4,5};
365  //Q is not a DNA character.
366  std::vector<std::string> data = {"QAAAA",
367  "AAGAA",
368  "ATGAA",
369  "TAACT"};
370 
371  Sequence::PolySites ps(std::move(pos),std::move(data));
372 
373  //Site will not get removed b/c we don't consider the outgroup
374  ps=removeAmbiguous(ps,true,0);
375 
376  BOOST_REQUIRE_EQUAL( ps.numsites(), 5 );
377 }
378 
379 BOOST_AUTO_TEST_CASE( remove_ambig3 )
380 {
381  std::vector<double> pos = {1,2,3,4,5};
382  //Q is not a DNA character.
383  std::vector<std::string> data = {"QAAAA",
384  "AAGAA",
385  "ATGAA",
386  "TAACT"};
387 
388  Sequence::PolySites ps(std::move(pos),std::move(data));
389 
390  //Site will get removed b/c if we use a different outgroup
391  ps=removeAmbiguous(ps,true,1);
392 
393  BOOST_REQUIRE_EQUAL( ps.numsites(), 4 );
394 }
395 
396 //Test functions in Sequence/PolyTableFunctions.hpp
397 
398 BOOST_AUTO_TEST_CASE( contains_char )
399 {
400  std::vector<double> pos = {1,2,3,4,5};
401  //Q is not a DNA character.
402  std::vector<std::string> data = {"QAAAA",
403  "AAGAA",
404  "ATGAA",
405  "TAACT"};
406 
407  Sequence::PolySites ps(std::move(pos),std::move(data));
408 
409  //test for things in the poly table
410  for( auto c : { 'Q','A','G','T','C' } )
411  {
412  BOOST_REQUIRE_EQUAL( Sequence::containsCharacter(&ps,c), true );
413  }
414 
415  //make sure that it does't think other stuff is there
416  for( auto c : { '-','N','K','?' } )
417  {
418  BOOST_REQUIRE_EQUAL( Sequence::containsCharacter(&ps,c), false );
419  }
420 }
421 
422 //A table is valid if it contains onlt A,G,C,T,N,- as characters
423 BOOST_AUTO_TEST_CASE( is_valid )
424 {
425  std::vector<double> pos = {1,2,3,4,5};
426  //Q is not a DNA character.
427  std::vector<std::string> data = {"QAAAA",
428  "AAGAA",
429  "ATGAA",
430  "TAACT"};
431 
432  Sequence::PolySites ps(std::move(pos),std::move(data));
433 
434  BOOST_REQUIRE_EQUAL( Sequence::polyTableValid(&ps), false );
435 
436  ps=removeAmbiguous(ps);
437 
438  BOOST_REQUIRE_EQUAL( Sequence::polyTableValid(&ps), true );
439 }
440 
441 //TODO: restore
442 
443 // BOOST_AUTO_TEST_CASE( identity_chars )
444 // {
445 // std::vector<double> pos = {1,2,3,4,5};
446 // //Q is not a DNA character.
447 // std::vector<std::string> data = {"AAAAA",
448 // "AAGAA",
449 // "ATGAA",
450 // "TAACT"};
451 // Sequence::PolySites ps(std::move(pos),std::move(data)),
452 // ps2(ps);
453 
454 // //Fill in a K where seqs 1-3 match seq 0
455 // Sequence::addIdentityChar(&ps2,0,'K');
456 
457 // BOOST_REQUIRE_EQUAL( Sequence::containsCharacter(&ps2,'K'), true );
458 
459 // //Reverse what we just did
460 
461 // Sequence::fillIn(&ps2,0,'K');
462 
463 // BOOST_REQUIRE_EQUAL( Sequence::containsCharacter(&ps2,'K'), false );
464 // BOOST_REQUIRE( ps==ps2 );
465 // }
466 
467 BOOST_AUTO_TEST_CASE( remove_gaps )
468 {
469  std::vector<double> pos = {1,2,3,4,5};
470  //Q is not a DNA character.
471  std::vector<std::string> data = {"-AAAA",
472  "AAGAA",
473  "ATGAA",
474  "TAACT"};
475 
476  Sequence::PolySites ps(std::move(pos),std::move(data));
477 
478  ps=Sequence::removeGaps(ps);
479 
480  BOOST_REQUIRE_EQUAL( ps.numsites(), 4 );
481 }
482 
483 BOOST_AUTO_TEST_CASE( remove_invariant )
484 {
485  std::vector<double> pos = {1,2,3,4,5};
486  //Q is not a DNA character.
487  std::vector<std::string> data = {"AAAAA",
488  "AAGAA",
489  "ATGAA",
490  "TAACT"};
491 
492  Sequence::PolySites ps(std::move(pos),std::move(data));
493 
494  //This will remove nothing
495  ps =Sequence::removeInvariantPos(ps);
496 
497  BOOST_REQUIRE_EQUAL( ps.numsites(), 5 );
498 
499  //This will remove sites 0,3,4
500  ps=Sequence::removeInvariantPos(ps,true,3);
501  BOOST_REQUIRE_EQUAL( ps.numsites(), 2 );
502 }
503 
504 
505 BOOST_AUTO_TEST_SUITE_END()
506 
bool polyTableValid(const PolyTable *t) __attribute__((deprecated))
Polymorphism tables for sequence data.
Definition: PolySites.hpp:33
The namespace in which this library resides.
bool containsCharacter(const PolyTable *t, const char ch) __attribute__((deprecated))
Site-major variation tables in ASCII format.
Sequence::PolySites, generates polymorphism tables from data.
Declaration of Sequence::Fasta streams.
Operations on non-const Sequence::PolyTable objects.