DFTracer Bloom Filter Indexing

Namespace: dftracer::utils::utilities::composites::dft::indexing

For a usage guide and examples, see DFTracer Indexing System.

class BloomFilter

Split block Bloom filter for approximate set membership testing.

Implements the split block Bloom filter from the Apache Parquet spec. The bit array is divided into 256-bit blocks, each made of 8 x uint32 words. Every insert/query touches exactly one block (32 bytes, i.e. within one cache line) and sets/tests one bit in each of the 8 words via a fixed SALT array. Block selection uses Lemire’s reduction on h1; in-block masks use h2 multiplied by SALT.

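References: Apache Parquet format specification (BloomFilter.md, split block Bloom filter); Daniel Lemire, “A fast alternative to the modulo reduction”.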

Differs from canonical Parquet:

  • Underlying hash is FNV1a + SplitMix64 finisher (not xxhash64).

  • Custom 12-byte little-endian header (num_hashes, num_entries, num_bits) instead of Thrift; num_hashes is stored in the header but is not used by add() or possibly_contains() (vestigial).

Serialization format (self-describing): [4 bytes: num_hashes (uint32_t LE)] [4 bytes: num_entries (uint32_t LE)] [4 bytes: num_bits (uint32_t LE)] [remaining: bit array bytes]

Public Functions

explicit BloomFilter(std::size_t expected_entries = 1024, double false_positive_rate = 0.01)
void add(std::string_view value)
bool possibly_contains(std::string_view value) const
void merge_from(const BloomFilter &other)
std::vector<unsigned char> serialize() const
void serialize_into(std::vector<unsigned char> &result) const
inline std::size_t num_entries() const
inline std::size_t size_bytes() const
inline std::size_t num_hash_functions() const
inline std::size_t num_bits() const

Public Static Functions

static BloomFilter from_blob(const unsigned char *data, std::size_t size)
class BloomFilterCache

Thread-safe bounded cache for deserialized bloom filters. Keyed by (index_path, dimension, checkpoint_idx) for chunk blooms, or (index_path, dimension, UINT64_MAX) for file-level blooms. When the cache exceeds max_entries, it is cleared entirely.

Public Functions

inline explicit BloomFilterCache(std::size_t max_entries = DEFAULT_MAX_ENTRIES)
inline std::optional<BloomFilter> get(const std::string &index_path, const std::string &dimension, std::uint64_t checkpoint_idx) const

Look up a cached bloom filter. Returns nullopt on miss.

inline void put(const std::string &index_path, const std::string &dimension, std::uint64_t checkpoint_idx, const BloomFilter &bloom)

Insert a bloom filter into the cache. Evicts all entries if full.

inline std::size_t size() const

Public Static Attributes

static constexpr std::size_t DEFAULT_MAX_ENTRIES = 10000
static constexpr std::uint64_t FILE_LEVEL_SENTINEL = UINT64_MAX
struct ChunkDimensionStats

Per-dimension per-chunk metadata for query optimization.

Public Functions

ChunkDimensionStats() = default
inline ChunkDimensionStats(const ChunkDimensionStats &other)
inline ChunkDimensionStats(ChunkDimensionStats &&other) noexcept
inline ChunkDimensionStats &operator=(const ChunkDimensionStats &other)
inline ChunkDimensionStats &operator=(ChunkDimensionStats &&other) noexcept
void observe(std::string_view value)

Record a value observation. Updates min/max, distinct_count, value_counts.

void observe_range_only(std::uint64_t value)
std::vector<std::uint8_t> serialize_value_counts() const

Serialize value_counts to binary format: [u32 LE num_entries] [u16 LE key_len, key bytes, u64 LE count]*

std::optional<std::vector<std::uint8_t>> compress_value_counts(std::size_t cap_bytes = 4096) const

Compress serialized value_counts with zlib. Returns nullopt if compressed size exceeds cap_bytes.

Public Members

std::string dimension

Dimension name (e.g., “cat”, “name”).

std::uint64_t distinct_count = 0

Number of unique values.

std::string min_value

Minimum value (numeric-aware for uint/int/double).

std::string max_value

Maximum value.

std::string value_type = "string"

Type of the values in this dimension: one of “string”, “uint”, “int”, or “double”.

std::optional<dftracer::utils::StringViewMap<std::uint64_t>> value_counts
const std::string *last_key_ = nullptr
std::uint64_t *last_counter_ = nullptr

Public Static Functions

static dftracer::utils::StringViewMap<std::uint64_t> deserialize_value_counts(const std::uint8_t *data, std::size_t len)
static dftracer::utils::StringViewMap<std::uint64_t> decompress_value_counts(const std::uint8_t *data, std::size_t len)

Decompress zlib-compressed value_counts, then deserialize.

struct ChunkDimensionStatsResult

Result type for querying chunk_dimension_stats from the shared index DB.

Public Functions

inline bool has_value_counts_payload() const
inline void ensure_value_counts_decoded() const

Public Members

std::uint64_t checkpoint_idx
std::string dimension
std::uint64_t distinct_count
std::string min_value
std::string max_value
std::string value_type
mutable std::optional<dftracer::utils::StringViewMap<std::uint64_t>> value_counts
mutable std::vector<std::uint8_t> compressed_value_counts
struct ChunkIndexState

Public Members

std::uint64_t checkpoint_idx = 0
std::size_t events_processed = 0
IndexedDimensions indexed_dims
HashResolutions hash_resolutions
ChunkStatistics statistics
std::size_t config_hash = 0
struct ChunkIndexerConfig

Public Functions

inline std::size_t compute_hash() const

Public Members

bool index_name = true
bool index_cat = true
bool index_pid = true
bool index_tid = true
bool index_hhash = true
bool index_fhash = true
bool index_shash = true
std::vector<std::string> extra_dimensions
std::size_t expected_entries_per_chunk = 1024
double false_positive_rate = 0.01
bool build_manifest = false
std::size_t value_counts_cap = 4096
struct ChunkIndexerInput

Public Functions

inline ChunkIndexerInput &with_file_path(const std::string &path)
inline ChunkIndexerInput &with_index_path(const std::string &path)
inline ChunkIndexerInput &with_checkpoint_size(std::size_t size)
inline ChunkIndexerInput &with_checkpoint_idx(std::uint64_t idx)
inline ChunkIndexerInput &with_byte_range(std::size_t start, std::size_t end)
inline ChunkIndexerInput &with_config(const ChunkIndexerConfig &cfg)
inline ChunkIndexerInput &with_batch_size(std::size_t size)
inline ChunkIndexerInput &with_hash_maps(HashResolveMap hh, HashResolveMap fh, HashResolveMap sh)
inline ChunkIndexerInput &with_existing_state(std::shared_ptr<ChunkIndexState> state)

Public Members

std::string file_path
std::string index_path
std::size_t checkpoint_size = 0
std::uint64_t checkpoint_idx = 0
std::size_t start_byte = 0
std::size_t end_byte = 0
ChunkIndexerConfig config
std::size_t batch_size = 4 * 1024 * 1024
HashResolveMap hhash_map
HashResolveMap fhash_map
HashResolveMap shash_map
std::shared_ptr<ChunkIndexState> existing_state
struct ChunkIndexerOutput

Public Members

std::uint64_t checkpoint_idx = 0
std::unordered_map<std::string, BloomFilter> bloom_filters
ChunkStatistics statistics
HashResolutions hash_resolutions
std::size_t events_processed = 0
bool success = false
std::vector<EventLineGroup> event_line_groups
std::vector<MetadataLineGroup> metadata_line_groups
class ChunkIndexerUtility : public dftracer::utils::utilities::Utility<ChunkIndexerInput, ChunkIndexerOutput, utilities::tags::Parallelizable>

Public Functions

ChunkIndexerUtility() = default
coro::CoroTask<ChunkIndexerOutput> process(const ChunkIndexerInput &input) override
struct ChunkPrunerBatchInput

Public Members

std::string index_path
std::vector<ChunkPrunerBatchItem> items
BloomFilterCache *cache = nullptr
indexer::IndexDatabase *external_db = nullptr
struct ChunkPrunerBatchItem

Input for batched pruning across many files that share the same .dftindex store. Allows a single RocksDB scan per column family to populate per-file pruner contexts instead of one scan per file.

Public Members

std::string file_path
Query query
struct ChunkPrunerBatchOutput

Public Members

std::vector<ChunkPrunerOutput> outputs

One output per entry of ChunkPrunerBatchInput::items, in the same order (outputs[i] corresponds to items[i]).

bool success = false
struct ChunkPrunerInput

Input for chunk pruning: index path, file path, query, optional cache.

If external_db is non-null the utility reuses that handle instead of opening the RocksDB at index_path itself. This lets callers that prune many files against the same directory-level index amortize the (expensive) RocksDB open cost to once per batch rather than once per file.

Public Members

std::string index_path

Path to the .dftindex store.

std::string file_path

Path to trace file.

Query query

Query to evaluate for pruning.

BloomFilterCache *cache = nullptr

Optional bloom filter cache.

indexer::IndexDatabase *external_db = nullptr

Optional already-open DB handle to reuse. If null, the utility opens the RocksDB at index_path itself.

struct ChunkPrunerOutput

Result of chunk pruning.

Public Members

bool file_may_match = false

True if any chunk may match.

std::vector<std::uint64_t> candidate_checkpoints

Indices of the chunks (checkpoints) that may match the query.

std::uint64_t total_checkpoints = 0

Total chunks in file.

bool success = false

True if pruning completed without error.

class ChunkPrunerUtility : public dftracer::utils::utilities::Utility<ChunkPrunerInput, ChunkPrunerOutput, utilities::tags::Parallelizable>

Three-tier chunk pruner: dictionary → min/max range → bloom filter. Walks the Query AST recursively (AND=intersect, OR=union, NOT=complement).

Public Functions

ChunkPrunerUtility() = default
coro::CoroTask<ChunkPrunerOutput> process(const ChunkPrunerInput &input) override
ChunkPrunerBatchOutput process_batch(const ChunkPrunerBatchInput &input)

Batch-prune many files against the same index with shared RocksDB range scans for dim_stats / chunk_statistics.

struct ChunkStatistics

Per-chunk statistics for DFTracer events.

Tracks event counts by category/name/pid:tid, timestamp ranges, and duration statistics using Welford’s online algorithm for variance. Map fields serialize to JSON text for storage in the shared .dftindex database.

Public Functions

void update_from_event(std::string_view name, std::string_view cat, std::uint64_t pid, std::uint64_t tid, std::uint64_t ts, std::uint64_t dur)
void merge_from(const ChunkStatistics &other)
double duration_mean() const
double duration_variance() const
std::string name_category_json() const
std::string name_duration_histograms_json() const
std::string name_duration_sums_json() const
std::string name_duration_sum_sqs_json() const
std::vector<std::uint8_t> serialize_name_duration_sketches() const

Serialize per-name DDSketches to a single binary blob.

Public Members

std::uint64_t total_events = 0
StringViewMap<std::uint64_t> category_counts
StringViewMap<std::uint64_t> name_counts
StringViewMap<std::uint64_t> pid_tid_counts
std::uint64_t min_timestamp_us = std::numeric_limits<std::uint64_t>::max()
std::uint64_t max_timestamp_us = 0
std::uint64_t duration_count = 0
std::int64_t duration_sum_us = 0
std::uint64_t duration_min_us = std::numeric_limits<std::uint64_t>::max()
std::uint64_t duration_max_us = 0
double duration_m2 = 0.0
common::statistics::DDSketch duration_sketch = {0.01}
common::statistics::Log2Histogram duration_histogram
common::statistics::TimestampHistogram timestamp_histogram
StringViewMap<common::statistics::DDSketch> name_duration_sketches
StringViewMap<common::statistics::Log2Histogram> name_duration_histograms
StringViewMap<double> name_duration_sums
StringViewMap<double> name_duration_sum_sqs
StringViewMap<std::string> name_category

Public Static Functions

static StringViewMap<std::string> parse_string_map_json(const std::string &json)
static StringViewMap<double> parse_double_map_json(const std::string &json)
static StringViewMap<common::statistics::Log2Histogram> parse_histogram_map_json(const std::string &json)
static StringViewMap<common::statistics::DDSketch> deserialize_name_duration_sketches(const std::uint8_t *data, std::size_t len)
struct EventLineGroup

Public Members

std::string cat
std::string name
std::vector<std::uint32_t> line_numbers
struct FileWorkItem

Public Members

std::size_t file_index = 0
std::string file_path
std::int32_t file_id = -1
class IndexResolverUtility : public dftracer::utils::utilities::Utility<ResolverInput, ResolverResult, utilities::tags::Parallelizable, utilities::tags::NeedsContext>

Public Functions

coro::CoroTask<ResolverResult> process(const ResolverInput &input) override
struct IndexedDimensions

Public Functions

inline bool has_dimension(const std::string &dim) const
inline void add_dimension(const std::string &dim)
inline std::vector<std::string> missing_dimensions(const ChunkIndexerConfig &config) const

Public Members

std::vector<std::string> dimensions
struct MetadataLineGroup

Public Members

std::string meta_type
std::vector<std::uint32_t> line_numbers
struct ResolveAndBuildInput

Public Members

std::string directory
std::vector<std::string> files
std::string index_dir
std::size_t checkpoint_size = 32 * 1024 * 1024
std::size_t parallelism = 0
bool force_rebuild = false
bool require_checkpoints = true
bool require_bloom = false
bool require_manifest = false
bool require_aggregation = false
std::optional<aggregators::AggregationConfig> aggregation_config
struct ResolvedFile

Public Members

std::size_t file_index = 0
std::string file_path
std::int32_t file_id = -1
indexer::IndexFileEntryCapability capabilities = indexer::IndexFileEntryCapability::NONE
struct ResolverInput

Public Members

std::string directory
std::string index_dir
std::vector<std::string> files
bool require_checkpoints = true
bool require_bloom = false
bool require_manifest = false
bool require_aggregation = false
std::optional<aggregators::AggregationConfig> aggregation_config
struct ResolverResult

Public Functions

inline std::size_t total_needs_work() const
inline std::size_t total_cached() const

Public Members

std::vector<std::string> all_files
std::vector<std::size_t> all_file_sizes
std::string index_path
std::vector<FileWorkItem> needs_checkpoint
std::vector<FileWorkItem> needs_bloom
std::vector<FileWorkItem> needs_manifest
std::vector<FileWorkItem> needs_aggregation
std::vector<ResolvedFile> cached
bool needs_augmentation = false
std::uint64_t stored_time_interval_us = 0