Presentation
This page presents some code snippets related to the use of k-mer API.
Some of the snippets presented below can be used online.
Additional snippets are available in directory: gatb-core/gatb-core/examples/kmer.
Using a kmer model
This snippet shows how to create kmer models.
Code is from example kmer1.cpp:
int main (int argc, char* argv[])
{
Kmer<>::ModelCanonical model;
std::cout << "---------- MODEL ----------" << std::endl;
std::cout << "span: " << model.getSpan() << std::endl;
std::cout << "kmer size: " << model.getKmerSize() << std::endl;
std::cout << "kmer memory size: " << model.getMemorySize() << std::endl;
}
[go back to top]
Computing kmers with a model
This snippet shows how to get kmers from a model. Here, we can see that we can use 3 different kinds of models, giving different kinds of kmers:
- ModelDirect : direct kmers
- ModelCanonical : minimum value of the direct kmer and its reverse complement
- ModelMinimizer : provides also a minimizer for the kmer
The snippet shows different methods usable for each kind of model.
Code is from example kmer2.cpp:
int main (int argc, char* argv[])
{
#define KSIZE_32 (KSIZE_LIST == 32)
std::cout << "this example cannot be run when gatb-core is compiled for kmers <= 32" << std::endl;
#if KSIZE_32
#else
const char* seq = argc<2 ? "CTACGAATT" : argv[1];
std::cout << std::endl << "INITIAL STRING IS '" << seq << "'" << std::endl;
size_t kmerSize = strlen(seq);
if (kmerSize >= span)
{
std::cerr << "STRING TOO BIG (" << kmerSize << "), must be less than " << span << std::endl;
return EXIT_FAILURE;
}
typedef Kmer<span>::ModelDirect ModelDirect;
typedef Kmer<span>::ModelCanonical ModelCanonical;
typedef Kmer<span>::ModelMinimizer<ModelCanonical> ModelMinimizer;
{
ModelDirect model (kmerSize);
ModelDirect::Kmer kmer = model.codeSeed (seq, Data::ASCII);
std::cout << std::endl;
std::cout << "-------------------- DIRECT --------------------" << std::endl;
std::cout << "kmer value is: " << kmer.value() << std::endl;
std::cout << "kmer string is: " << model.toString(kmer.value()) << std::endl;
}
{
ModelCanonical model (kmerSize);
ModelCanonical::Kmer kmer = model.codeSeed (seq, Data::ASCII);
std::cout << std::endl;
std::cout << "-------------------- CANONICAL --------------------" << std::endl;
std::cout << "kmer value is: " << kmer.value() << std::endl;
std::cout << "kmer string is: " << model.toString(kmer.value()) << std::endl;
std::cout << "forward value is: " << kmer.forward() << std::endl;
std::cout << "forward string is: " << model.toString(kmer.forward()) << std::endl;
std::cout << "revcomp value is: " << kmer.revcomp() << std::endl;
std::cout << "revcomp string is: " << model.toString(kmer.revcomp()) << std::endl;
std::cout << "used strand is : " << toString(kmer.strand()) << std::endl;
}
{
ModelMinimizer model (kmerSize, 8);
const ModelCanonical& modelMinimizer = model.getMmersModel();
ModelMinimizer::Kmer kmer = model.codeSeed (seq, Data::ASCII);
std::cout << std::endl;
std::cout << "-------------------- MINIMIZER --------------------" << std::endl;
std::cout << "kmer value is: " << kmer.value() << std::endl;
std::cout << "kmer string is: " << model.toString(kmer.value()) << std::endl;
std::cout << "forward value is: " << kmer.forward() << std::endl;
std::cout << "forward string is: " << model.toString(kmer.forward()) << std::endl;
std::cout << "revcomp value is: " << kmer.revcomp() << std::endl;
std::cout << "revcomp string is: " << model.toString(kmer.revcomp()) << std::endl;
std::cout << "used strand is : " << toString(kmer.strand()) << std::endl;
std::cout << "minimizer model size : " << modelMinimizer.getKmerSize() << std::endl;
std::cout << "minimizer value is : " << kmer.minimizer().value() << std::endl;
std::cout << "minimizer string is : " << modelMinimizer.toString(kmer.minimizer().value()) << std::endl;
std::cout << "minimizer position in kmer : " << kmer.position() << std::endl;
std::cout << "minimizer changed : " << kmer.hasChanged() << std::endl;
}
#endif
}
[go back to top]
Iterating kmers from a sequence
This snippet shows how to iterate the kmers from a sequence, for a given model.
Code is from example kmer3.cpp:
#include <iostream>
int main (int argc, char* argv[])
{
const char* seq = "CATTGATAGTGGATGGT";
std::cout << "Initial sequence: " << seq << std::endl;
Data data ((char*)seq);
Kmer<>::ModelDirect model (5);
Kmer<>::ModelDirect::Iterator it (model);
it.setData (data);
for (it.first(); !it.isDone(); it.next())
{
std::cout << "kmer " << model.toString(it->value()) << ", value " << it->value() << std::endl;
}
}
[go back to top]
Iterating kmers from one or several banks
This snippet shows how to iterate the kmers from banks. In particular, we use two iterators and two loops:
- outer loop on sequences of the bank
- inner loop on kmer on the current sequence from the outer loop
Code is from example kmer4.cpp:
#include <iostream>
#include <string.h>
int main (int argc, char* argv[])
{
OptionsParser parser ("KmerTest");
parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true));
parser.push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true));
parser.push_back (new OptionNoParam (STR_VERBOSE, "display kmers", false));
try
{
IProperties* options = parser.parse (argc, argv);
size_t kmerSize = options->getInt(STR_KMER_SIZE);
bool verbose = options->get(STR_VERBOSE) != 0;
u_int64_t nbSequences = 0;
u_int64_t nbKmers = 0;
IBank* bank = Bank::open (options->getStr(STR_URI_INPUT));
Kmer<span>::ModelDirect model (kmerSize);
Kmer<span>::ModelDirect::Iterator itKmer (model);
ProgressIterator<Sequence> itSeq (*bank);
for (itSeq.first(); !itSeq.isDone(); itSeq.next())
{
itKmer.setData (itSeq->getData());
for (itKmer.first(); !itKmer.isDone(); itKmer.next())
{
if (verbose) { cout << model.toString (itKmer->value()) << endl; }
nbKmers++;
}
nbSequences++;
}
cout << "FOUND " << nbKmers << " kmers in " << nbSequences << " sequences" << endl;
}
catch (OptionFailure& e)
{
return e.displayErrors (std::cout);
}
catch (Exception& e)
{
std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
}
return EXIT_SUCCESS;
}
[go back to top]
Computing statistics about minimizers
This snippet shows iterate the kmers of an input bank and computes some statistics about the iterated minimizers.
It also computes the following distribution : number of times a read has X different minimizers (in other words, the number of super kmers).
Code is from example kmer5.cpp:
#include <iostream>
#include <fstream>
#include <string.h>
typedef Kmer<span>::ModelDirect ModelDirect;
typedef Kmer<span>::ModelMinimizer<ModelDirect> ModelMinimizer;
int main (int argc, char* argv[])
{
static const char* STR_URI_DISTRIB = "-distrib";
OptionsParser parser ("KmerTest");
parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true));
parser.push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true));
parser.push_back (new OptionOneParam (STR_MINIMIZER_SIZE, "minimizer size", true));
parser.push_back (new OptionNoParam (STR_URI_DISTRIB, "compute distribution: number of times a read has X superkmers", false));
parser.push_back (new OptionNoParam (STR_VERBOSE, "display kmers", false));
try
{
IProperties* options = parser.parse (argc, argv);
string bankFilename = options->getStr(STR_URI_INPUT);
size_t kmerSize = options->getInt(STR_KMER_SIZE);
size_t mmerSize = options->getInt(STR_MINIMIZER_SIZE);
u_int64_t nbSequences = 0;
u_int64_t nbKmers = 0;
u_int64_t nbMinimizers = 0;
u_int64_t nbMinimizers2 = 0;
bool display = options->get(STR_VERBOSE) != 0;
IBank* bank = Bank::open (bankFilename);
ModelMinimizer model (kmerSize, mmerSize);
const ModelDirect& modelMinimizer = model.getMmersModel();
map<size_t,size_t> distribSuperKmers;
ProgressIterator<Sequence> itSeq (*bank);
for (itSeq.first(); !itSeq.isDone(); itSeq.next())
{
size_t nbMinimizersPerRead = 0;
model.iterate (itSeq->getData(), [&] (const ModelMinimizer::Kmer& kmer, size_t idx)
{
if (display)
{
std::cout << "KMER=" << model.toString(kmer.value()) << " "
<< (kmer.hasChanged() ? "NEW" : "OLD") << " "
<< "MINIMIZER=" << modelMinimizer.toString(kmer.minimizer().value()) << " "
<< "at position " << kmer.position()
<< std::endl;
}
if (kmer.hasChanged()==true) { nbMinimizersPerRead++; }
nbKmers ++;
});
distribSuperKmers [nbMinimizersPerRead]++;
nbSequences ++;
nbMinimizers += nbMinimizersPerRead;
nbMinimizers2 += nbMinimizersPerRead*nbMinimizersPerRead;
}
double mean = nbSequences>0 ? (double)nbMinimizers / (double) nbSequences : 0;
double mean2 = nbSequences>0 ? (double)nbMinimizers2 / (double) nbSequences : 0;
double devia = sqrt (mean2 - mean*mean);
Properties info;
info.add (0, "info");
info.add (1, "bank", "%s", bankFilename.c_str());
info.add (1, "kmer_size", "%ld", kmerSize);
info.add (1, "mmer_size", "%ld", mmerSize);
info.add (1, "nb_sequences", "%ld", nbSequences);
info.add (1, "nb_kmers", "%ld", nbKmers);
info.add (1, "nb_minimizers", "%ld", nbMinimizers);
info.add (1, "mean", "%.2f", mean);
info.add (1, "deviation", "%.2f", devia);
cout << info << endl;
if (options->get(STR_URI_DISTRIB))
{
string outputFilename = System::file().getBaseName(bankFilename) + string (".distrib");
FILE* output = fopen (outputFilename.c_str(), "w");
if (output)
{
for (map<size_t,size_t>::iterator it = distribSuperKmers.begin(); it != distribSuperKmers.end(); ++it)
{
fprintf (output, "%ld %ld\n", it->first, it->second);
}
fclose (output);
}
}
}
catch (OptionFailure& e)
{
return e.displayErrors (std::cout);
}
catch (Exception& e)
{
std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
}
return EXIT_SUCCESS;
}
[go back to top]
Checking span of kmers model
This snippet shows what is legal in terms of kmers size.
Actually, a kmers model has a type with a 'span' template. This span represents the maximum kmer size reachable for this type. For instance, a span of 32 allows up to kmer size of 31, a span of 64 up to 63, etc...
The 'span' value must be one of the project defined constants: KSIZE_1, KSIZE_2, KSIZE_3 and KSIZE_4. By default, we have KSIZE_n = 32*n. Note that it is possible to compile GATB-CORE with a bigger value of KSIZE_4=128. See Instructions to compile GATB-Core.
It is important to understand that each one of the four span values defines a specific kmers model with a specific integer type that represents the kmers values. For instance:
- KSIZE_1=32 implies that we need 64 bits integers (2 bits per nucleotide), which is available as a native type on 64 bits architecture
- KSIZE_2=64 implies that we need 128 bits integers which may (or not) be available as native integer type
- for KSIZE_3 and KSIZE_4, we need to switch to specific large integer representations that are no more native on the system, which implies bigger computation times.
Code is from example kmer6.cpp:
int main (int argc, char* argv[])
{
try
{
Kmer<32>::ModelCanonical model (51);
}
catch (Exception& e)
{
std::cout << "EXCEPTION: " << e.getMessage() << std::endl;
}
try
{
Kmer<64>::ModelCanonical model (51);
std::cout << "It's OK..." << std::endl;
}
catch (Exception& e)
{
std::cout << "EXCEPTION: " << e.getMessage() << std::endl;
}
return EXIT_SUCCESS;
}
[go back to top]
Other kind of statistics about minimizers
This snippet shows how to iterate the kmers of an input bank and computes some statistics about the iterated minimizers.
Code is from example kmer7.cpp:
#include <vector>
#include <memory>
int main (int argc, char* argv[])
{
OptionsParser parser ("KmerStats");
parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank reference", true));
parser.push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true));
try
{
IProperties* options = parser.parse (argc, argv);
size_t kmerSize = options->getInt (STR_KMER_SIZE);
size_t nbKmers = 1 << (2*kmerSize);
Kmer<>::ModelDirect model (kmerSize);
IBank* bank = Bank::open (options->getStr(STR_URI_INPUT));
vector<NativeInt64> distrib (nbKmers);
size_t totalKmers = 0;
ProgressIterator<Sequence> iter (*bank, "iterate bank");
for (iter.first(); !iter.isDone(); iter.next())
{
Sequence& seq = iter.item();
size_t len = seq.getDataSize() / kmerSize;
char* data = seq.getDataBuffer();
for (size_t i=0; i<len; i++, data += kmerSize)
{
Kmer<>::ModelDirect::Kmer kmer = model.codeSeed (data, seq.getDataEncoding());
distrib [kmer.value().toInt()] += 1;
}
}
std::sort (distrib.begin(), distrib.end());
auto_ptr<Storage> storage (StorageFactory(STORAGE_HDF5).create ("output", true, false));
Collection<NativeInt64>& distribCollect = storage->getGroup("distrib").getCollection<NativeInt64> (options->getStr (STR_KMER_SIZE));
distribCollect.addProperty ("name", options->getStr (STR_URI_INPUT));
distribCollect.addProperty ("kmer_size", options->getStr (STR_KMER_SIZE));
distribCollect.insert (distrib);
}
catch (OptionFailure& e)
{
return e.displayErrors (cout);
}
catch (Exception& e)
{
std::cout << "EXCEPTION: " << e.getMessage() << std::endl;
}
return EXIT_SUCCESS;
}
[go back to top]
Setting custom minimizers definition
This snippet shows how to configure custom minimizer definition through a functor.
Code is from example kmer8.cpp:
#include <iostream>
#include <fstream>
#include <string.h>
struct CustomMinimizer
{
template<class Model> void init (const Model& model, Kmer<span>::Type& optimum) const
{
optimum = model.getKmerMax();
}
bool operator() (const Kmer<span>::Type& current, const Kmer<span>::Type& optimum) const
{
return current < optimum;
}
void include_frequency (uint32_t* freq_order) {}
};
struct CustomMaximizer
{
template<class Model> void init (const Model& model, Kmer<span>::Type& optimum) const
{
optimum.setVal(0);
}
bool operator() (const Kmer<span>::Type& current, const Kmer<span>::Type& optimum) const
{
return !(current < optimum);
}
void include_frequency (uint32_t* freq_order) {}
};
typedef Kmer<span>::ModelDirect ModelDirect;
typedef Kmer<span>::ModelMinimizer<ModelDirect,CustomMinimizer> ModelMinimizer;
typedef Kmer<span>::ModelMinimizer<ModelDirect,CustomMaximizer> ModelMaximizer;
typedef ModelMinimizer Model;
int main (int argc, char* argv[])
{
OptionsParser parser ("KmerTest");
parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true));
parser.push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true));
parser.push_back (new OptionOneParam (STR_MINIMIZER_SIZE, "minimizer size", true));
parser.push_back (new OptionNoParam (STR_VERBOSE, "display kmers", false));
try
{
IProperties* options = parser.parse (argc, argv);
size_t kmerSize = options->getInt(STR_KMER_SIZE);
size_t mmerSize = options->getInt(STR_MINIMIZER_SIZE);
u_int64_t nbKmers = 0;
bool display = options->get(STR_VERBOSE) != 0;
IBank* bank = Bank::open (options->getStr(STR_URI_INPUT));
Model model (kmerSize, mmerSize);
const ModelMinimizer::Model& modelMinimizer = model.getMmersModel();
Kmer<span>::Type checksum;
size_t nbChanged = 0;
size_t nbInvalid = 0;
ProgressIterator<Sequence> iter (*bank, "iterate bank");
for (iter.first(); !iter.isDone(); iter.next())
{
Sequence& seq = iter.item();
model.iterate (seq.getData(), [&] (const Model::Kmer& kmer, size_t idx)
{
nbKmers ++;
if (kmer.hasChanged() == true) { nbChanged++; }
if (kmer.isValid() == false) { nbInvalid++; }
checksum += kmer.minimizer().value();
});
}
cout << "nbKmers : " << nbKmers << endl;
cout << "nbInvalid : " << nbInvalid << endl;
cout << "nbChanged : " << nbChanged << endl;
cout << "ratio : " << (nbChanged > 0 ? (double)nbKmers / (double)nbChanged : 0) << endl;
cout << "checksum : " << checksum << endl;
}
catch (OptionFailure& e)
{
return e.displayErrors (std::cout);
}
catch (Exception& e)
{
std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
}
return EXIT_SUCCESS;
}
[go back to top]
Counting kmers from a bank with the sorting count algorithm
This snippet shows how to use the SortingCountAlgorithm class for counting kmers in a bank
Code is from example kmer9.cpp:
int main (int argc, char* argv[])
{
IOptionsParser* parser = SortingCountAlgorithm<>::getOptionsParser();
try
{
IProperties* options = parser->parse (argc, argv);
IBank* bank = Bank::open (options->getStr(STR_URI_INPUT));
SortingCountAlgorithm<> algo (bank, options);
algo.execute();
cout << *algo.getInfo();
}
catch (OptionFailure& e)
{
return e.displayErrors (std::cout);
}
catch (Exception& e)
{
std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
}
return EXIT_SUCCESS;
}
[go back to top]
Reading a file generated by the sorting count algorithm
This snippet shows how to read the output of the SortingCountAlgorithm.
We use two ways for reading the couples [kmer,abundance]
- we read all the couples with a single iterator
- we read each solid collection and use an iterator on it
Code is from example kmer10.cpp:
int main (int argc, char* argv[])
{
OptionsParser parser ("SortingCount");
parser.push_back (new OptionOneParam (STR_URI_INPUT, "sorting count input", true));
try
{
typedef Kmer<>::Count Count;
typedef Kmer<>::Type Type;
IProperties* options = parser.parse (argc, argv);
Storage* storage = StorageFactory(STORAGE_HDF5).load (options->getStr(STR_URI_INPUT));
LOCAL (storage);
Group& dskGroup = storage->getGroup("dsk");
Partition<Count>& solidKmers = dskGroup.getPartition<Count> ("solid");
Type checksum1, checksum2;
Iterator<Count>* it = solidKmers.iterator();
LOCAL (it);
for (it->first(); !it->isDone(); it->next()) { checksum1 = checksum1 + it->item().value; }
for (size_t i=0; i<solidKmers.size(); i++)
{
Collection<Count>& collection = solidKmers [i];
Iterator<Count>* it = collection.iterator();
LOCAL (it);
for (it->first(); !it->isDone(); it->next()) { checksum2 = checksum2 + it->item().value; }
}
cout << "checksum1=" << checksum1 << endl;
cout << "checksum2=" << checksum1 << endl;
}
catch (OptionFailure& e)
{
return e.displayErrors (std::cout);
}
catch (Exception& e)
{
std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
}
return EXIT_SUCCESS;
}
[go back to top]