Examples

This section provides practical examples of using pydftracer in different scenarios.

Python Examples

Application Level Example

This example demonstrates using pydftracer at the application level with explicit initialization and function decorators.

 1 from dftracer.python import dftracer, dft_fn
 2 import numpy as np
 3 import os
 4 from time import sleep
 5 from multiprocessing import get_context
 6
 7 # Initialize DFTracer
 8 log_inst = dftracer.initialize_log(logfile=None, data_dir=None, process_id=-1)
 9
10 # Create function tracer for compute operations
11 compute_tracer = dft_fn("COMPUTE")
12
13 # Example of using function decorators
14 @compute_tracer.log
15 def log_events(index):
16     sleep(1)
17
18 # Example of function spawning and implicit I/O calls
19 def posix_calls(val):
20     index, is_spawn = val
21     cwd = os.getcwd()
22     path = f"{cwd}/data/demofile{index}.txt"
23     f = open(path, "w+")
24     f.write("Now the file has more content!")
25     f.close()
26
27     if is_spawn:
28         print(f"Calling spawn on {index} with pid {os.getpid()}")
29         log_inst.finalize()  # Finalize DFTracer in spawned process
30     else:
31         print(f"Not calling spawn on {index} with pid {os.getpid()}")
32
33 # NPZ operations (internally calls POSIX)
34 def npz_calls(index):
35     cwd = os.getcwd()
36     path = f"{cwd}/data/demofile{index}.npz"
37     if os.path.exists(path):
38         os.remove(path)
39     records = np.random.randint(255, size=(8, 8, 1024), dtype=np.uint8)
40     record_labels = [0] * 1024
41     np.savez(path, x=records, y=record_labels)
42
43 def main():
44     log_events(0)
45     npz_calls(1)
46
47     # Spawn processes for parallel operations
48     with get_context('spawn').Pool(1) as pool:
49         pool.map(posix_calls, ((2, True),))
50
51     log_inst.finalize()
52
53 if __name__ == "__main__":
54     main()

Environment Configuration:

For this example, you need to set the following environment variables:

1 # Log file path (process ID, app name, and .pfw will be appended)
2 # Final log file: ~/log_file-<APP_NAME>-<PID>.pfw
3 export DFTRACER_LOG_FILE=~/log_file
4
5 # Colon-separated paths for profiling
6 export DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset:$PWD/data
7
8 # Enable DFTracer
9 export DFTRACER_ENABLE=1

LD_PRELOAD Example

This example shows using DFTracer with LD_PRELOAD for automatic I/O interception without explicit initialization in the code.

 1 import numpy as np
 2 import os
 3 from multiprocessing import get_context
 4
 5 # Example of function spawning and implicit I/O calls
 6 def posix_calls(val):
 7     index, is_spawn = val
 8     cwd = os.getcwd()
 9     path = f"{cwd}/data/demofile{index}.txt"
10     f = open(path, "w+")
11     f.write("Now the file has more content!")
12     f.close()
13
14     if is_spawn:
15         print(f"Calling spawn on {index} with pid {os.getpid()}")
16     else:
17         print(f"Not calling spawn on {index} with pid {os.getpid()}")
18
19 # NPZ operations (internally calls POSIX)
20 def npz_calls(index):
21     cwd = os.getcwd()
22     path = f"{cwd}/data/demofile{index}.npz"
23     if os.path.exists(path):
24         os.remove(path)
25     records = np.random.randint(255, size=(8, 8, 1024), dtype=np.uint8)
26     record_labels = [0] * 1024
27     np.savez(path, x=records, y=record_labels)
28
29 def main():
30     npz_calls(1)
31
32     with get_context('spawn').Pool(1) as pool:
33         pool.map(posix_calls, ((2, True),))
34
35 if __name__ == "__main__":
36     main()

Environment Configuration:

 1 # Log file path (process ID, app name, and .pfw will be appended)
 2 export DFTRACER_LOG_FILE=~/log_file
 3
 4 # Colon-separated paths for profiling (include $PWD/data, where the script writes)
 5 export DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset:$PWD/data
 6
 7 # Set initialization mode to PRELOAD
 8 export DFTRACER_INIT=PRELOAD
 9
10 # Enable DFTracer
11 export DFTRACER_ENABLE=1
12
13 # Run with LD_PRELOAD
14 LD_PRELOAD=/path/to/libdftracer_preload.so python your_script.py

Hybrid Mode Example

This example demonstrates the hybrid mode, combining both application-level initialization and LD_PRELOAD for comprehensive profiling.

 1 from dftracer.python import dftracer, dft_fn
 2 import numpy as np
 3 import os
 4 from time import sleep
 5 from multiprocessing import get_context
 6
 7 # Initialize DFTracer at application level
 8 log_inst = dftracer.initialize_log(logfile=None, data_dir=None, process_id=-1)
 9 compute_tracer = dft_fn("COMPUTE")
10
11 # Example of using function decorators
12 @compute_tracer.log
13 def log_events(index):
14     sleep(1)
15
16 # Example of function spawning and implicit I/O calls
17 def posix_calls(val):
18     index, is_spawn = val
19     cwd = os.getcwd()
20     path = f"{cwd}/data/demofile{index}.txt"
21     f = open(path, "w+")
22     f.write("Now the file has more content!")
23     f.close()
24
25     if is_spawn:
26         print(f"Calling spawn on {index} with pid {os.getpid()}")
27         log_inst.finalize()
28     else:
29         print(f"Not calling spawn on {index} with pid {os.getpid()}")
30
31 # NPZ operations
32 def npz_calls(index):
33     cwd = os.getcwd()
34     path = f"{cwd}/data/demofile{index}.npz"
35     if os.path.exists(path):
36         os.remove(path)
37     records = np.random.randint(255, size=(8, 8, 1024), dtype=np.uint8)
38     record_labels = [0] * 1024
39     np.savez(path, x=records, y=record_labels)
40
41 def main():
42     log_events(0)
43     npz_calls(1)
44
45     with get_context('spawn').Pool(1) as pool:
46         pool.map(posix_calls, ((2, True),))
47
48     log_inst.finalize()
49
50 if __name__ == "__main__":
51     main()

Environment Configuration:

 1 # Log file path
 2 export DFTRACER_LOG_FILE=~/log_file
 3
 4 # Data directories to profile (include $PWD/data, where the script writes)
 5 export DFTRACER_DATA_DIR=/dev/shm/:/p/gpfs1/$USER/dataset:$PWD/data
 6
 7 # Set to PRELOAD mode
 8 export DFTRACER_INIT=PRELOAD
 9
10 # Enable DFTracer
11 export DFTRACER_ENABLE=1
12
13 # Run with LD_PRELOAD
14 LD_PRELOAD=/path/to/libdftracer_preload.so python your_script.py

Deep Learning Example: ResNet50 on ALCF Polaris

This example shows how to profile a ResNet50 training workload using PyTorch and torchvision on the Polaris supercomputer at Argonne Leadership Computing Facility.

Environment Setup

Create a conda environment and install dependencies:

 1 #!/bin/bash +x
 2 set -e
 3 set -x
 4
 5 export MODULEPATH=/soft/modulefiles/conda/:$MODULEPATH
 6 module load 2023-10-04  # Latest conda module on Polaris
 7
 8 export ML_ENV=$PWD/PolarisAT/conda-envs/ml_workload_latest_conda_2
 9
10 if [[ -e $ML_ENV ]]; then
11     conda activate $ML_ENV
12 else
13     # Clone base environment
14     conda create -p $ML_ENV --clone /soft/datascience/conda/2023-10-04/mconda3/
15     conda activate $ML_ENV
16
17     # Install MPI4Py with GPU support
18     yes | MPICC="cc -shared -target-accel=nvidia80" \
19         pip install --force-reinstall --no-cache-dir --no-binary=mpi4py mpi4py
20
21     # Install pydftracer
22     yes | pip install --no-cache-dir git+https://github.com/hariharan-devarajan/dftracer.git
23
24     # Reinstall torch and horovod
25     pip uninstall -y torch horovod
26     yes | pip install --no-cache-dir horovod
27 fi

Application Instrumentation

Since the PyTorch DataLoader used with torchvision.datasets.ImageFolder spawns separate Python worker processes for parallel data loading, we use hybrid mode (see Hybrid Mode Example) to capture I/O from both the main process and the spawned workers.

 1 import os
 2 from dftracer.python import dftracer as logger, dft_fn as dft_event_logging
 3
 4 # Initialize DFTracer
 5 dft_pid = os.getpid()
 6 log_inst = logger.initialize_log(
 7     f"./resnet50/dft_fn_py_level-{dft_pid}.pfw",
 8     "",
 9     dft_pid
10 )
11
12 # Create tracers for different operation types
13 compute_dft = dft_event_logging("Compute")
14 io_dft = dft_event_logging("IO", name="real_IO")
15
16 def train(epoch, model, train_loader, criterion, device):
17     """Training loop with DFTracer instrumentation"""
18
19     # Trace data loading iterations
20     for i, (images, target) in io_dft.iter(enumerate(train_loader)):
21
22         # Trace CPU to GPU transfer
23         with dft_event_logging(
24             "communication-except-io",
25             name="cpu-gpu-transfer",
26             step=i,
27             epoch=epoch
28         ) as transfer:
29             images = images.to(device)
30             target = target.to(device)
31
32         # Trace forward propagation
33         with dft_event_logging(
34             "compute",
35             name="model-compute-forward-prop",
36             step=i,
37             epoch=epoch
38         ) as compute:
39             output = model(images)
40             loss = criterion(output, target)
41
42         # Trace backward propagation
43         with dft_event_logging(
44             "compute",
45             name="model-compute-backward-prop",
46             step=i,
47             epoch=epoch
48         ) as compute:
49             acc1, acc5 = accuracy(output, target, topk=(1, 5))
50             losses.update(loss.item(), images.size(0))
51             top1.update(acc1[0], images.size(0))
52             top5.update(acc5[0], images.size(0))
53
54 def main():
55     # ... model setup and training ...
56
57     # Finalize DFTracer
58     log_inst.finalize()
59
60 if __name__ == "__main__":
61     main()

Job Submission Script

 1 #!/bin/bash
 2
 3 # Load environment
 4 export MODULEPATH=/soft/modulefiles/conda/:$MODULEPATH
 5 module load 2023-10-04
 6 conda activate ./dlio_ml_workloads/PolarisAT/conda-envs/ml_workload_latest_conda_2
 7
 8 # Set library path
 9 export LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH
10
11 # DFTracer configuration
12 export DFTRACER_LOG_LEVEL=ERROR
13 export DFTRACER_ENABLE=1
14 export DFTRACER_INC_METADATA=1
15 export DFTRACER_INIT=PRELOAD
16
17 # Path to ResNet50 dataset
18 export DFTRACER_DATA_DIR=./resnet_original_data
19
20 # POSIX-level log file
21 export DFTRACER_LOG_FILE=./dft_fn_posix_level.pfw
22
23 # Run with LD_PRELOAD
24 LD_PRELOAD=$CONDA_PREFIX/lib/python*/site-packages/dftracer/lib/libdftracer_preload.so \
25     aprun -n 4 -N 4 python resnet_hvd_dlio.py \
26     --batch-size 64 \
27     --epochs 1 \
28     > dft_fn.log 2>&1
29
30 # Combine all trace files (Python-level traces are written under ./resnet50/)
31 cat *.pfw resnet50/*.pfw > combined_logs.pfw

Understanding the Output

This configuration produces two types of trace files:

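  1. Python-level traces (resnet50/dft_fn_py_level-*.pfw): Function-level events recorded by the dft_fn tracers (context managers and iterator wrappers in this example)

  2. POSIX-level traces (dft_fn_posix_level*.pfw): Low-level I/O operations captured through LD_PRELOAD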

Combine them into a single file (the Python-level traces are written under ./resnet50/) using:

cat *.pfw resnet50/*.pfw > combined_logs.pfw

Integrated Applications

pydftracer is currently used in production by several applications:

  1. DLIO Benchmark - GitHub

    Comprehensive I/O benchmark for deep learning workloads

  2. MuMMI - Multiscale Machine-learned Modeling Infrastructure

    Large-scale molecular dynamics simulations

  3. ResNet50 Training - PyTorch and torchvision

    Image classification with distributed training

Additional Resources

For more examples and use cases, see: