Indexer Components¶
See also
For complete class and member documentation, see the API Reference.
Indexing and searching functionality for compressed trace files.
All classes are in the dftracer::utils::utilities::indexer namespace.
classDiagram
class dftracer__utils__utilities__indexer__FileMetadataResult["FileMetadataResult"]
class dftracer__utils__utilities__indexer__FileRegistryEntry["FileRegistryEntry"]
class dftracer__utils__utilities__indexer__IndexBatchBuilderUtility["IndexBatchBuilderUtility"]
dftracer__utils__utilities__indexer__IndexBatchBuilderUtility : +process() CoroTask
class dftracer__utils__utilities__indexer__IndexBatchSink["IndexBatchSink"]
<<abstract>> dftracer__utils__utilities__indexer__IndexBatchSink
dftracer__utils__utilities__indexer__IndexBatchSink : +insert_file_metadata() void
dftracer__utils__utilities__indexer__IndexBatchSink : +insert_checkpoint() void
dftracer__utils__utilities__indexer__IndexBatchSink : +insert_event_range() void
class dftracer__utils__utilities__indexer__IndexBuildBatchConfig["IndexBuildBatchConfig"]
class dftracer__utils__utilities__indexer__IndexBuildBatchMetrics["IndexBuildBatchMetrics"]
class dftracer__utils__utilities__indexer__IndexBuildBatchResult["IndexBuildBatchResult"]
class dftracer__utils__utilities__indexer__IndexBuildConfig["IndexBuildConfig"]
dftracer__utils__utilities__indexer__IndexBuildConfig : +for_file() IndexBuildConfig
dftracer__utils__utilities__indexer__IndexBuildConfig : +with_index_dir() IndexBuildConfig &
dftracer__utils__utilities__indexer__IndexBuildConfig : +with_checkpoint_size() IndexBuildConfig &
class dftracer__utils__utilities__indexer__IndexBuildResult["IndexBuildResult"]
class dftracer__utils__utilities__indexer__IndexBuilderUtility["IndexBuilderUtility"]
dftracer__utils__utilities__indexer__IndexBuilderUtility : +process() CoroTask
class dftracer__utils__utilities__indexer__IndexDatabase["IndexDatabase"]
dftracer__utils__utilities__indexer__IndexDatabase : +begin_write() unique_ptr
dftracer__utils__utilities__indexer__IndexDatabase : +bulk_ingest() void
dftracer__utils__utilities__indexer__IndexDatabase : +rebuild_root_summaries() void
class dftracer__utils__utilities__indexer__IndexDatabaseSstWriterContext["IndexDatabaseSstWriterContext"]
dftracer__utils__utilities__indexer__IndexDatabaseSstWriterContext : +insert_file_metadata() void
dftracer__utils__utilities__indexer__IndexDatabaseSstWriterContext : +insert_checkpoint() void
dftracer__utils__utilities__indexer__IndexDatabaseSstWriterContext : +insert_event_range() void
class dftracer__utils__utilities__indexer__IndexDatabaseWriterContext["IndexDatabaseWriterContext"]
dftracer__utils__utilities__indexer__IndexDatabaseWriterContext : +commit() void
dftracer__utils__utilities__indexer__IndexDatabaseWriterContext : +has_file_scalar_stats() bool
dftracer__utils__utilities__indexer__IndexDatabaseWriterContext : +init_schema() void
class dftracer__utils__utilities__indexer__IndexVisitor["IndexVisitor"]
<<abstract>> dftracer__utils__utilities__indexer__IndexVisitor
dftracer__utils__utilities__indexer__IndexVisitor : +begin() void
dftracer__utils__utilities__indexer__IndexVisitor : +on_checkpoint() CoroTask
dftracer__utils__utilities__indexer__IndexVisitor : +on_chunk() CoroTask
class dftracer__utils__utilities__indexer__MergedStatisticsResult["MergedStatisticsResult"]
class dftracer__utils__utilities__indexer__NameSummaryResult["NameSummaryResult"]
class dftracer__utils__utilities__indexer__ProvenanceDatabase["ProvenanceDatabase"]
dftracer__utils__utilities__indexer__ProvenanceDatabase : +init_schema() void
dftracer__utils__utilities__indexer__ProvenanceDatabase : +get_or_create_file_info() int
dftracer__utils__utilities__indexer__ProvenanceDatabase : +get_file_info_id() int
class dftracer__utils__utilities__indexer__RootStatisticsResult["RootStatisticsResult"]
class dftracer__utils__utilities__indexer__SstArtifactRegistry["SstArtifactRegistry"]
dftracer__utils__utilities__indexer__SstArtifactRegistry : +append() void
dftracer__utils__utilities__indexer__SstArtifactRegistry : +metadata() vector &
dftracer__utils__utilities__indexer__SstArtifactRegistry : +checkpoints() vector &
class dftracer__utils__utilities__indexer__TarArchiveMetadata["TarArchiveMetadata"]
class dftracer__utils__utilities__indexer__TarFileRecord["TarFileRecord"]
class dftracer__utils__utilities__indexer__internal__Cursor["Cursor"]
dftracer__utils__utilities__indexer__internal__Cursor : +u8() uint8_t
dftracer__utils__utilities__indexer__internal__Cursor : +u32() uint32_t
dftracer__utils__utilities__indexer__internal__Cursor : +u64() uint64_t
class dftracer__utils__utilities__indexer__internal__DecodeContextGuard["DecodeContextGuard"]
class dftracer__utils__utilities__indexer__internal__Indexer["Indexer"]
<<abstract>> dftracer__utils__utilities__indexer__internal__Indexer
dftracer__utils__utilities__indexer__internal__Indexer : +build_async() CoroTask
dftracer__utils__utilities__indexer__internal__Indexer : +build() void
dftracer__utils__utilities__indexer__internal__Indexer : +need_rebuild() bool
class dftracer__utils__utilities__indexer__internal__IndexerCheckpoint["IndexerCheckpoint"]
class dftracer__utils__utilities__indexer__internal__IndexerError["IndexerError"]
dftracer__utils__utilities__indexer__internal__IndexerError : +type() Type
class dftracer__utils__utilities__indexer__internal__IndexerFactory["IndexerFactory"]
dftracer__utils__utilities__indexer__internal__IndexerFactory : +create() shared_ptr
dftracer__utils__utilities__indexer__internal__IndexerFactory : +detect_format() ArchiveFormat
dftracer__utils__utilities__indexer__internal__IndexerFactory : +generate_index_path() string
class dftracer__utils__utilities__Utility["Utility"]
style dftracer__utils__utilities__Utility fill:#e0e0e0,stroke:#999
dftracer__utils__utilities__Utility <|-- dftracer__utils__utilities__indexer__IndexBuilderUtility
dftracer__utils__utilities__indexer__IndexBatchSink <|-- dftracer__utils__utilities__indexer__IndexDatabaseSstWriterContext
dftracer__utils__utilities__indexer__IndexBatchSink <|-- dftracer__utils__utilities__indexer__IndexDatabaseWriterContext
Overview¶
The indexer module provides sidecar index files (.idx) for efficient
random access to compressed trace files. Indexes store:
Checkpoints: Byte offsets and decompression state for random access
Bloom filters: Per-chunk probabilistic membership tests for event filtering
Manifests: Per-checkpoint event line routing tables for reorganization
Chunk statistics: Per-chunk event counts, timestamps, duration distributions
A separate provenance database (.pidx) records source-to-output mappings
produced during reorganization.
Getting Started¶
Build an index for a compressed trace file using the fluent configuration API:
#include <dftracer/utils/utilities/indexer/index_builder_utility.h>
using namespace dftracer::utils::utilities::indexer;
auto config = IndexBuildConfig::for_file("trace.pfw.gz")
.with_index_dir("/tmp/indexes")
.with_checkpoint_size(32 * 1024 * 1024)
.with_bloom(true)
.with_manifest(true)
.with_bloom_dimensions(default_bloom_dimensions());
IndexBuilderUtility builder;
IndexBuildResult result = co_await builder.process(config);
if (result.success) {
// result.idx_path contains the path to the .idx sidecar file
// result.events_processed, result.chunks_processed hold stats
}
Once an index exists, open it directly with IndexDatabase to query bloom
filters, manifests, or chunk statistics:
#include <dftracer/utils/utilities/indexer/index_database.h>
IndexDatabase db("trace.pfw.gz.idx");
int file_id = db.find_file("trace.pfw.gz");
// Query time bounds across all chunks
auto bounds = db.query_time_bounds(file_id);
// Query bloom filters for a specific dimension
auto blooms = db.query_chunk_bloom_filters(file_id, "name");
// Query per-checkpoint event routing manifests
auto ranges = db.query_event_ranges(file_id);
IndexBuildConfig¶
Fluent builder for configuring an index build pass. Start with the
static factory for_file() and chain with_* methods:
with_index_dir(dir) – output directory for .idx files
with_checkpoint_size(bytes) – decompression checkpoint interval (default 32 MB)
with_index_threshold(bytes) – minimum file size to index
with_force_rebuild(true) – rebuild even if an index already exists
with_bloom(true) – enable per-chunk bloom filter construction
with_manifest(true) – enable per-checkpoint event routing manifests
with_bloom_dimensions(dims) – which JSON fields to index (default: name, cat, pid, tid, hhash, fhash, shash)
IndexBuildResult¶
Returned by IndexBuilderUtility::process(). Contains:
idx_path – path to the produced .idx sidecar
success / was_skipped / index_created – outcome flags
events_processed / chunks_processed / total_lines – build statistics
error_message – non-empty on failure
IndexBuilderUtility¶
Coroutine-based utility that drives the full index build pipeline. Extends
Utility<IndexBuildConfig, IndexBuildResult, tags::NeedsContext> and requires
an executor context to run. Call process(config) inside a coroutine to
build the index asynchronously.
IndexBuilderUtility builder;
IndexBuildResult result = co_await builder.process(config);
IndexDatabase¶
SQLite-backed .idx sidecar file that stores all index data for a trace
file. Schema is additive – call init_base_schema() always, then
init_bloom_schema() and/or init_manifest_schema() as needed.
Provides methods for inserting and querying:
Bloom data: insert_chunk_bloom_filter(), query_chunk_bloom_filters(), query_file_bloom_filter()
Chunk statistics: insert_chunk_statistics(), query_chunk_statistics(), query_time_bounds()
Dimension stats: insert_chunk_dimension_stats(), query_chunk_dimension_stats()
Hash resolutions: insert_hash_resolution(), query_resolved_by_hash(), query_hash_by_resolved()
Manifests: insert_event_range(), query_event_ranges(), insert_metadata_lines(), query_metadata_lines()
IndexDatabase db("trace.pfw.gz.idx");
db.init_base_schema();
db.init_bloom_schema();
int file_id = db.get_or_create_file_info("trace.pfw.gz", file_hash);
db.insert_chunk_statistics(file_id, checkpoint_idx, stats);
IndexVisitor¶
Abstract visitor interface for index building passes. Implement this to add custom indexing logic during the checkpoint-by-checkpoint scan. The builder calls visitors in order:
begin(num_checkpoints) – called once before the scan starts
on_checkpoint(idx) – called at each checkpoint boundary
on_line(line, checkpoint_idx) – called for every line in the file
finalize(db, file_id) – called once after the scan to persist results
Indexer / CheckpointIndexer¶
The low-level checkpoint indexer is exposed as Indexer (formerly named
BatchIndexer); the previous Indexer class is now CheckpointIndexer
in the internal namespace. SingleFileIndexer has been removed; use
IndexBuilderUtility or IndexBatchBuilderUtility instead.
IndexBatchBuilderUtility¶
Batched variant of IndexBuilderUtility that processes a list of files in
parallel against a shared IndexDatabaseWriterContext, yielding an
IndexBuildBatchResult with aggregated metrics. Configured via
IndexBuildBatchConfig (file list, parallelism, checkpoint size, bloom and
manifest toggles, shared sink).
IndexBuildBatchConfig¶
Configuration struct for IndexBatchBuilderUtility: file slices, output
directory, checkpoint size, bloom/manifest flags, and the shared
IndexBatchSink (typically an IndexDatabaseWriterContext) that
receives encoded batches from all workers.
IndexDatabaseWriterContext¶
Implements IndexBatchSink and owns a thread-safe writer pipeline into a
RocksDB-backed IndexDatabase. Workers in IndexBatchBuilderUtility
submit encoded index batches to this context, which serializes them into
checkpoint, bloom, manifest, and statistics column families.
BloomVisitor¶
Implements DftEventVisitor to build per-chunk bloom filters and
statistics during the indexing scan. Each checkpoint chunk gets its own set
of bloom filters (one per configured dimension) plus per-chunk event counts
and timestamp/duration distributions.
#include <dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h>
BloomVisitor visitor(bloom_config, {"name", "cat", "pid"});
visitor.begin(num_checkpoints);
for (auto& [checkpoint_idx, line] : lines) {
visitor.on_checkpoint(checkpoint_idx);
visitor.on_line(line, checkpoint_idx);
}
visitor.finalize(db, file_id);
ManifestVisitor¶
Implements DftEventVisitor to build per-checkpoint event routing
manifests. During the scan, it collects which lines belong to which
(cat, name) event pair within each checkpoint. The resulting manifests
enable the reorganization pipeline to selectively read only the lines needed
for a given event group.
#include <dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h>
ManifestVisitor visitor;
visitor.begin(num_checkpoints);
// ... scan lines ...
visitor.finalize(db, file_id);
// Later, query the manifest:
auto ranges = db.query_event_ranges_for_checkpoint(file_id, checkpoint_idx);
IndexResolverUtility¶
Resolves a directory or file list into a set of FileWorkItem entries by
opening or building per-file indexes and emitting line-range work items
suitable for parallel scan / aggregation / replay pipelines. Defined in
dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h.
ProvenanceDatabase¶
SQLite-backed .pidx sidecar that records the full reorganization provenance
of an output file: which source files contributed, which checkpoints were read,
and which line ranges map to which output lines.
Schema tables:
file_info – output file identity (path + hash)
provenance_info – key/value metadata (tool version, timestamp, etc.)
provenance_sources – source files that contributed to this output
provenance_group – named predicate groups used during reorganization
provenance_segments – per-checkpoint line range mappings
#include <dftracer/utils/utilities/indexer/provenance_database.h>
ProvenanceDatabase pdb("output.pfw.gz.pidx");
pdb.init_schema();
int fid = pdb.get_or_create_file_info("output.pfw.gz", file_hash);
pdb.insert_info("version", "1.0");
pdb.insert_source(fid, 0, "source.pfw.gz", num_checkpoints);
pdb.insert_segment(0, checkpoint_idx, out_start, out_end, event_count);
// Query provenance later
auto sources = pdb.query_sources(fid);
auto segments = pdb.query_segments(source_idx);