Needle
An application for fast and efficient searches of NGS data.
|
#include <iostream>
#include <math.h>
#include <numeric>
#include <string>
#include <seqan3/alphabet/container/concatenated_sequences.hpp>
#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <filesystem>
#include "shared.h"
Go to the source code of this file.
Classes | |
struct | minimiser_arguments |
struct | RandomGenerator |
Generates a random integer not greater than a given maximum. More... | |
Functions | |
void | count (min_arguments const &args, std::vector< std::filesystem::path > sequence_files, std::filesystem::path include_file, std::filesystem::path genome_file, bool paired) |
Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available. | |
void | count_genome (min_arguments const &args, std::filesystem::path include_file, std::filesystem::path exclude_file) |
Creates a set of minimisers to ignore, which should be used as an input to count. | |
void | read_binary (std::filesystem::path filename, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table) |
Reads a binary file that needle minimiser creates. | |
void | read_binary_start (min_arguments &args, std::filesystem::path filename, uint64_t &num_of_minimisers, uint8_t &cutoff) |
Reads the beginning of a binary file that needle minimiser creates. | |
std::vector< uint16_t > | ibf (std::vector< std::filesystem::path > const &sequence_files, estimate_ibf_arguments &ibf_args, minimiser_arguments &minimiser_args, std::vector< double > &fpr, std::vector< uint8_t > &cutoffs, std::filesystem::path const expression_by_genome_file="", size_t num_hash=1) |
Creates IBFs. | |
std::vector< uint16_t > | ibf (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::vector< double > &fpr, std::filesystem::path const expression_by_genome_file="", size_t num_hash=1) |
Creates IBFs based on the minimiser files. | |
void | minimiser (std::vector< std::filesystem::path > const &sequence_files, min_arguments const &args, minimiser_arguments &minimiser_args, std::vector< uint8_t > &cutoffs) |
Create minimiser and header files. | |
std::vector< uint16_t > | insert (std::vector< std::filesystem::path > const &sequence_files, estimate_ibf_arguments &ibf_args, minimiser_arguments &minimiser_args, std::vector< uint8_t > &cutoffs, std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise) |
Insert into IBFs. | |
std::vector< uint16_t > | insert (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise) |
Insert into IBFs based on the minimiser files. | |
void | delete_bin (std::vector< uint64_t > const &delete_files, estimate_ibf_arguments &ibf_args, std::filesystem::path path_in, bool samplewise) |
Delete bins from ibfs. | |
void count | ( | min_arguments const & | args, |
std::vector< std::filesystem::path > | sequence_files, | ||
std::filesystem::path | include_file, | ||
std::filesystem::path | genome_file, | ||
bool | paired | ||
) |
Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available.
args | The minimiser arguments to use (seed, shape, window size). |
sequence_files | The sequence files, which contain the reads. |
include_file | A file containing the transcripts whose expression values should be determined. |
genome_file | A "*.genome" file constructed with the command genome. |
paired | Flag to indicate if input data is paired or not. |
void count_genome | ( | min_arguments const & | args, |
std::filesystem::path | include_file, | ||
std::filesystem::path | exclude_file | ||
) |
Creates a set of minimisers to ignore, which should be used as an input to count.
args | The minimiser arguments to use (seed, shape, window size). |
include_file | A file containing the transcripts whose expression values should be determined. |
exclude_file | A file containing minimisers which should be ignored. |
void delete_bin | ( | std::vector< uint64_t > const & | delete_files, |
estimate_ibf_arguments & | ibf_args, | ||
std::filesystem::path | path_in, | ||
bool | samplewise | ||
) |
Delete bins from ibfs.
delete_files | A vector of integers specifying the bins to delete. |
ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
path_in | Input directory. |
samplewise | True, if expression levels were set beforehand. |
std::vector< uint16_t > ibf | ( | std::vector< std::filesystem::path > const & | minimiser_files, |
estimate_ibf_arguments & | ibf_args, | ||
std::vector< double > & | fpr, | ||
std::filesystem::path const | expression_by_genome_file = "" , |
||
size_t | num_hash = 1 |
||
) |
Creates IBFs based on the minimiser files.
minimiser_files | A vector of minimiser file paths. |
ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
fpr | The average false positive rate that should be used. |
expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression_thresholds. |
num_hash | The number of hash functions to use. |
std::vector< uint16_t > ibf | ( | std::vector< std::filesystem::path > const & | sequence_files, |
estimate_ibf_arguments & | ibf_args, | ||
minimiser_arguments & | minimiser_args, | ||
std::vector< double > & | fpr, | ||
std::vector< uint8_t > & | cutoffs, | ||
std::filesystem::path const | expression_by_genome_file = "" , |
||
size_t | num_hash = 1 |
||
) |
Creates IBFs.
sequence_files | A vector of sequence file paths. |
ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
minimiser_args | The minimiser specific arguments to use. |
fpr | The average false positive rate that should be used. |
cutoffs | List of cutoffs. |
expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression thresholds. |
num_hash | The number of hash functions to use. |
std::vector< uint16_t > insert | ( | std::vector< std::filesystem::path > const & | minimiser_files, |
estimate_ibf_arguments & | ibf_args, | ||
std::filesystem::path const | expression_by_genome_file, | ||
std::filesystem::path | path_in, | ||
bool | samplewise | ||
) |
Insert into IBFs based on the minimiser files.
minimiser_files | A vector of minimiser file paths. |
ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression_thresholds. |
path_in | Input directory. |
samplewise | True, if expression levels were set beforehand. |
std::vector< uint16_t > insert | ( | std::vector< std::filesystem::path > const & | sequence_files, |
estimate_ibf_arguments & | ibf_args, | ||
minimiser_arguments & | minimiser_args, | ||
std::vector< uint8_t > & | cutoffs, | ||
std::filesystem::path const | expression_by_genome_file, | ||
std::filesystem::path | path_in, | ||
bool | samplewise | ||
) |
Insert into IBFs.
sequence_files | A vector of sequence file paths. |
ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
minimiser_args | The minimiser specific arguments to use. |
cutoffs | List of cutoffs. |
expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression thresholds. |
path_in | Input directory. |
samplewise | True, if expression levels were set beforehand. |
void minimiser | ( | std::vector< std::filesystem::path > const & | sequence_files, |
min_arguments const & | args, | ||
minimiser_arguments & | minimiser_args, | ||
std::vector< uint8_t > & | cutoffs | ||
) |
Create minimiser and header files.
sequence_files | A vector of sequence file paths. |
args | The minimiser arguments to use (seed, shape, window size). |
minimiser_args | The minimiser specific arguments to use. |
cutoffs | List of cutoffs. |
void read_binary | ( | std::filesystem::path | filename, |
robin_hood::unordered_node_map< uint64_t, uint16_t > & | hash_table | ||
) |
Reads a binary file that needle minimiser creates.
filename | The filename of the binary file. |
hash_table | The hash table to store minimisers into. |
void read_binary_start | ( | min_arguments & | args, |
std::filesystem::path | filename, | ||
uint64_t & | num_of_minimisers, | ||
uint8_t & | cutoff | ||
) |
Reads the beginning of a binary file that needle minimiser creates.
args | Min arguments. |
filename | The filename of the binary file. |
num_of_minimisers | Variable in which the number of minimisers should be stored. |
cutoff | cutoff value. |