DFTracer Bloom Filter Indexing¶
Namespace: dftracer::utils::utilities::composites::dft::indexing
For usage guide and examples, see DFTracer Indexing System.
-
class BloomFilter¶
Split block Bloom filter for approximate set membership testing.
Implements the split block Bloom filter from the Apache Parquet spec: 256-bit blocks of 8 x uint32 words; each insert/query touches exactly one block (one cache line) and sets/tests one bit in each of the 8 words via a fixed SALT array. Block selection uses Lemire’s reduction on h1; in-block masks use h2 multiplied by SALT.
References:
Apple, J. “Split block Bloom filters.” arXiv:2101.01719 (2021).
Putze, F., Sanders, P., Singler, J. “Cache-, hash-, and space-efficient bloom filters.” ACM JEA 14, Article 4 (2009).
Apache Parquet Bloom filter spec: https://github.com/apache/parquet-format/blob/master/BloomFilter.md
Differs from canonical Parquet:
Underlying hash is FNV1a + SplitMix64 finisher (not xxhash64).
Custom 12-byte LE header (num_hashes, num_entries, num_bits) instead of Thrift; num_hashes is unused at insert/test (vestigial).
Serialization format (self-describing): [4 bytes: num_hashes (uint32_t LE)] [4 bytes: num_entries (uint32_t LE)] [4 bytes: num_bits (uint32_t LE)] [remaining: bit array bytes]
Public Functions
-
explicit BloomFilter(std::size_t expected_entries = 1024, double false_positive_rate = 0.01)¶
-
void add(std::string_view value)¶
-
bool possibly_contains(std::string_view value) const¶
-
void merge_from(const BloomFilter &other)¶
-
std::vector<unsigned char> serialize() const¶
-
void serialize_into(std::vector<unsigned char> &result) const¶
-
inline std::size_t num_entries() const¶
-
inline std::size_t size_bytes() const¶
-
inline std::size_t num_hash_functions() const¶
-
inline std::size_t num_bits() const¶
Public Static Functions
-
static BloomFilter from_blob(const unsigned char *data, std::size_t size)¶
-
class BloomFilterCache¶
Thread-safe bounded cache for deserialized bloom filters. Keyed by (index_path, dimension, checkpoint_idx) for chunk blooms, or (index_path, dimension, UINT64_MAX) for file-level blooms. When the cache exceeds max_entries, it is cleared entirely.
Public Functions
-
inline explicit BloomFilterCache(std::size_t max_entries = DEFAULT_MAX_ENTRIES)¶
-
inline std::optional<BloomFilter> get(const std::string &index_path, const std::string &dimension, std::uint64_t checkpoint_idx) const¶
Look up a cached bloom filter. Returns nullopt on miss.
-
inline void put(const std::string &index_path, const std::string &dimension, std::uint64_t checkpoint_idx, const BloomFilter &bloom)¶
Insert a bloom filter into the cache. Evicts all entries if full.
-
inline std::size_t size() const¶
-
inline explicit BloomFilterCache(std::size_t max_entries = DEFAULT_MAX_ENTRIES)¶
-
struct ChunkDimensionStats¶
Per-dimension per-chunk metadata for query optimization.
Public Functions
-
ChunkDimensionStats() = default¶
-
inline ChunkDimensionStats(const ChunkDimensionStats &other)¶
-
inline ChunkDimensionStats(ChunkDimensionStats &&other) noexcept¶
-
inline ChunkDimensionStats &operator=(const ChunkDimensionStats &other)¶
-
inline ChunkDimensionStats &operator=(ChunkDimensionStats &&other) noexcept¶
-
void observe(std::string_view value)¶
Record a value observation. Updates min/max, distinct_count, value_counts.
-
void observe_range_only(std::uint64_t value)¶
-
std::vector<std::uint8_t> serialize_value_counts() const¶
Serialize value_counts to binary format: [u32 LE num_entries] [u16 LE key_len, key bytes, u64 LE count]*
-
std::optional<std::vector<std::uint8_t>> compress_value_counts(std::size_t cap_bytes = 4096) const¶
Compress serialized value_counts with zlib. Returns nullopt if compressed size exceeds cap_bytes.
Public Members
-
std::string dimension¶
Dimension name (e.g., “cat”, “name”).
-
std::uint64_t distinct_count = 0¶
Number of unique values.
-
std::string min_value¶
Minimum value (numeric-aware for uint/int/double).
-
std::string max_value¶
Maximum value.
-
std::string value_type = "string"¶
“string”, “uint”, “int”, or “double”.
-
std::optional<dftracer::utils::StringViewMap<std::uint64_t>> value_counts¶
-
const std::string *last_key_ = nullptr¶
-
std::uint64_t *last_counter_ = nullptr¶
Public Static Functions
-
static dftracer::utils::StringViewMap<std::uint64_t> deserialize_value_counts(const std::uint8_t *data, std::size_t len)¶
-
static dftracer::utils::StringViewMap<std::uint64_t> decompress_value_counts(const std::uint8_t *data, std::size_t len)¶
Decompress zlib-compressed value_counts, then deserialize.
-
ChunkDimensionStats() = default¶
-
struct ChunkDimensionStatsResult¶
Result type for querying chunk_dimension_stats from the shared index DB.
Public Functions
-
inline bool has_value_counts_payload() const¶
-
inline void ensure_value_counts_decoded() const¶
Public Members
-
std::uint64_t checkpoint_idx¶
-
std::string dimension¶
-
std::uint64_t distinct_count¶
-
std::string min_value¶
-
std::string max_value¶
-
std::string value_type¶
-
mutable std::optional<dftracer::utils::StringViewMap<std::uint64_t>> value_counts¶
-
mutable std::vector<std::uint8_t> compressed_value_counts¶
-
inline bool has_value_counts_payload() const¶
-
struct ChunkIndexState¶
Public Members
-
std::uint64_t checkpoint_idx = 0¶
-
std::size_t events_processed = 0¶
-
IndexedDimensions indexed_dims¶
-
HashResolutions hash_resolutions¶
-
ChunkStatistics statistics¶
-
std::size_t config_hash = 0¶
-
std::uint64_t checkpoint_idx = 0¶
-
struct ChunkIndexerConfig¶
Public Functions
-
inline std::size_t compute_hash() const¶
Public Members
-
bool index_name = true¶
-
bool index_cat = true¶
-
bool index_pid = true¶
-
bool index_tid = true¶
-
bool index_hhash = true¶
-
bool index_fhash = true¶
-
bool index_shash = true¶
-
std::vector<std::string> extra_dimensions¶
-
std::size_t expected_entries_per_chunk = 1024¶
-
double false_positive_rate = 0.01¶
-
bool build_manifest = false¶
-
std::size_t value_counts_cap = 4096¶
-
inline std::size_t compute_hash() const¶
-
struct ChunkIndexerInput¶
Public Functions
-
inline ChunkIndexerInput &with_file_path(const std::string &path)¶
-
inline ChunkIndexerInput &with_index_path(const std::string &path)¶
-
inline ChunkIndexerInput &with_checkpoint_size(std::size_t size)¶
-
inline ChunkIndexerInput &with_checkpoint_idx(std::uint64_t idx)¶
-
inline ChunkIndexerInput &with_byte_range(std::size_t start, std::size_t end)¶
-
inline ChunkIndexerInput &with_config(const ChunkIndexerConfig &cfg)¶
-
inline ChunkIndexerInput &with_batch_size(std::size_t size)¶
-
inline ChunkIndexerInput &with_hash_maps(HashResolveMap hh, HashResolveMap fh, HashResolveMap sh)¶
Public Members
-
std::string file_path¶
-
std::string index_path¶
-
std::size_t checkpoint_size = 0¶
-
std::uint64_t checkpoint_idx = 0¶
-
std::size_t start_byte = 0¶
-
std::size_t end_byte = 0¶
-
ChunkIndexerConfig config¶
-
std::size_t batch_size = 4 * 1024 * 1024¶
-
HashResolveMap hhash_map¶
-
HashResolveMap fhash_map¶
-
HashResolveMap shash_map¶
-
std::shared_ptr<ChunkIndexState> existing_state¶
-
inline ChunkIndexerInput &with_file_path(const std::string &path)¶
-
struct ChunkIndexerOutput¶
Public Members
-
std::uint64_t checkpoint_idx = 0¶
-
std::unordered_map<std::string, BloomFilter> bloom_filters¶
-
ChunkStatistics statistics¶
-
HashResolutions hash_resolutions¶
-
std::size_t events_processed = 0¶
-
bool success = false¶
-
std::vector<EventLineGroup> event_line_groups¶
-
std::vector<MetadataLineGroup> metadata_line_groups¶
-
std::uint64_t checkpoint_idx = 0¶
-
class ChunkIndexerUtility : public dftracer::utils::utilities::Utility<ChunkIndexerInput, ChunkIndexerOutput, utilities::tags::Parallelizable>¶
Public Functions
-
ChunkIndexerUtility() = default¶
-
coro::CoroTask<ChunkIndexerOutput> process(const ChunkIndexerInput &input) override¶
-
ChunkIndexerUtility() = default¶
-
struct ChunkPrunerBatchInput¶
Public Members
-
std::string index_path¶
-
std::vector<ChunkPrunerBatchItem> items¶
-
BloomFilterCache *cache = nullptr¶
-
indexer::IndexDatabase *external_db = nullptr¶
-
std::string index_path¶
-
struct ChunkPrunerBatchItem¶
Input for batched pruning across many files that share the same .dftindex store. Allows a single RocksDB scan per column family to populate per-file pruner contexts instead of one scan per file.
-
struct ChunkPrunerBatchOutput¶
-
struct ChunkPrunerInput¶
Input for chunk pruning: index path, file path, query, optional cache.
If external_db is non-null, the utility reuses that handle instead of opening the RocksDB at index_path itself. This lets callers that prune many files against the same directory-level index amortize the (expensive) RocksDB open cost to once per batch rather than once per file.
Public Members
-
std::string index_path¶
Path to the .dftindex store.
-
std::string file_path¶
Path to trace file.
-
Query query¶
Query to evaluate for pruning.
-
BloomFilterCache *cache = nullptr¶
Optional bloom filter cache.
-
indexer::IndexDatabase *external_db = nullptr¶
Reused DB handle.
-
std::string index_path¶
-
struct ChunkPrunerOutput¶
Result of chunk pruning.
-
class ChunkPrunerUtility : public dftracer::utils::utilities::Utility<ChunkPrunerInput, ChunkPrunerOutput, utilities::tags::Parallelizable>¶
Three-tier chunk pruner: dictionary → min/max range → bloom filter. Walks the Query AST recursively (AND=intersect, OR=union, NOT=complement).
Public Functions
-
ChunkPrunerUtility() = default¶
-
coro::CoroTask<ChunkPrunerOutput> process(const ChunkPrunerInput &input) override¶
-
ChunkPrunerBatchOutput process_batch(const ChunkPrunerBatchInput &input)¶
Batch-prune many files against the same index with shared RocksDB range scans for dim_stats / chunk_statistics.
-
ChunkPrunerUtility() = default¶
-
struct ChunkStatistics¶
Per-chunk statistics for DFTracer events.
Tracks event counts by category/name/pid:tid, timestamp ranges, and duration statistics using Welford’s online algorithm for variance. Map fields serialize to JSON text for storage in the shared .dftindex database.
Public Functions
-
void update_from_event(std::string_view name, std::string_view cat, std::uint64_t pid, std::uint64_t tid, std::uint64_t ts, std::uint64_t dur)¶
-
void merge_from(const ChunkStatistics &other)¶
-
double duration_mean() const¶
-
double duration_variance() const¶
-
std::string name_category_json() const¶
-
std::string name_duration_histograms_json() const¶
-
std::string name_duration_sums_json() const¶
-
std::string name_duration_sum_sqs_json() const¶
-
std::vector<std::uint8_t> serialize_name_duration_sketches() const¶
Serialize per-name DDSketches to a single binary blob.
Public Members
-
std::uint64_t total_events = 0¶
-
StringViewMap<std::uint64_t> category_counts¶
-
StringViewMap<std::uint64_t> name_counts¶
-
StringViewMap<std::uint64_t> pid_tid_counts¶
-
std::uint64_t min_timestamp_us = std::numeric_limits<std::uint64_t>::max()¶
-
std::uint64_t max_timestamp_us = 0¶
-
std::uint64_t duration_count = 0¶
-
std::int64_t duration_sum_us = 0¶
-
std::uint64_t duration_min_us = std::numeric_limits<std::uint64_t>::max()¶
-
std::uint64_t duration_max_us = 0¶
-
double duration_m2 = 0.0¶
-
common::statistics::Log2Histogram duration_histogram¶
-
common::statistics::TimestampHistogram timestamp_histogram¶
-
StringViewMap<common::statistics::Log2Histogram> name_duration_histograms¶
-
StringViewMap<double> name_duration_sums¶
-
StringViewMap<double> name_duration_sum_sqs¶
-
StringViewMap<std::string> name_category¶
Public Static Functions
-
static StringViewMap<std::string> parse_string_map_json(const std::string &json)¶
-
static StringViewMap<double> parse_double_map_json(const std::string &json)¶
-
static StringViewMap<common::statistics::Log2Histogram> parse_histogram_map_json(const std::string &json)¶
-
void update_from_event(std::string_view name, std::string_view cat, std::uint64_t pid, std::uint64_t tid, std::uint64_t ts, std::uint64_t dur)¶
-
struct EventLineGroup¶
-
struct FileWorkItem¶
-
class IndexResolverUtility : public dftracer::utils::utilities::Utility<ResolverInput, ResolverResult, utilities::tags::Parallelizable, utilities::tags::NeedsContext>¶
Public Functions
-
coro::CoroTask<ResolverResult> process(const ResolverInput &input) override¶
-
coro::CoroTask<ResolverResult> process(const ResolverInput &input) override¶
-
struct IndexedDimensions¶
Public Functions
-
inline bool has_dimension(const std::string &dim) const¶
-
inline void add_dimension(const std::string &dim)¶
-
inline std::vector<std::string> missing_dimensions(const ChunkIndexerConfig &config) const¶
Public Members
-
std::vector<std::string> dimensions¶
-
inline bool has_dimension(const std::string &dim) const¶
-
struct MetadataLineGroup¶
-
struct ResolveAndBuildInput¶
Public Members
-
std::string directory¶
-
std::vector<std::string> files¶
-
std::string index_dir¶
-
std::size_t checkpoint_size = 32 * 1024 * 1024¶
-
std::size_t parallelism = 0¶
-
bool force_rebuild = false¶
-
bool require_checkpoints = true¶
-
bool require_bloom = false¶
-
bool require_manifest = false¶
-
bool require_aggregation = false¶
-
std::optional<aggregators::AggregationConfig> aggregation_config¶
-
std::string directory¶
-
struct ResolvedFile¶
-
struct ResolverInput¶
-
struct ResolverResult¶
Public Functions
-
inline std::size_t total_needs_work() const¶
-
inline std::size_t total_cached() const¶
Public Members
-
std::vector<std::string> all_files¶
-
std::vector<std::size_t> all_file_sizes¶
-
std::string index_path¶
-
std::vector<FileWorkItem> needs_checkpoint¶
-
std::vector<FileWorkItem> needs_bloom¶
-
std::vector<FileWorkItem> needs_manifest¶
-
std::vector<FileWorkItem> needs_aggregation¶
-
std::vector<ResolvedFile> cached¶
-
bool needs_augmentation = false¶
-
std::uint64_t stored_time_interval_us = 0¶
-
inline std::size_t total_needs_work() const¶