-
-
Notifications
You must be signed in to change notification settings - Fork 598
Expand file tree
/
Copy pathrun_profile.py
More file actions
110 lines (90 loc) · 3.25 KB
/
run_profile.py
File metadata and controls
110 lines (90 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""
import math
import os
import sys
import torch
import torch.nn as nn
from torch import Tensor
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
from typing import Any, Dict, List, Optional
from src.core import YAMLConfig, yaml_utils
__all__ = ["profile_stats"]
def _auto_scale_flops(flops: float):
"""Copied from torch.profiler.profile"""
flop_headers = [
"",
"K",
"M",
"G",
"T",
"P",
]
assert flops > 0
log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
assert log_flops >= 0 and log_flops < len(flop_headers)
return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])
def profile_stats(
    model: nn.Module,
    data: Optional[Tensor] = None,
    shape: Optional[List[int]] = None,
    verbose: bool = False,
) -> Dict[str, Any]:
    """Profile a model's trainable-parameter count and forward-pass FLOPs.

    Runs ``model`` in eval mode through ``torch.profiler`` (one warmup step,
    one measured step) and aggregates the per-op FLOPs it reports. The
    model's original train/eval mode is restored before returning.

    Args:
        model: Module to profile.
        data: Optional input tensor. When ``None``, a random tensor of
            ``shape`` is created with the model's parameter dtype/device.
        shape: Input shape used when ``data`` is None.
            Defaults to ``[1, 3, 640, 640]``.
        verbose: If True, print the profiler table and a summary.

    Returns:
        Dict with ``'n_parameters'`` (trainable parameter count),
        ``'n_flops'`` (FLOPs per measured step) and ``'info'`` (the
        profiler table as a string).
    """
    # Avoid a mutable default argument: build the default shape per call.
    if shape is None:
        shape = [1, 3, 640, 640]

    is_training = model.training

    # requires_grad does not depend on train/eval mode, so count directly.
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    model.eval()

    if data is None:
        dtype = next(model.parameters()).dtype
        device = next(model.parameters()).device
        data = torch.rand(*shape, dtype=dtype, device=device)

    # Schedule: no wait/skip, 1 warmup + 1 active step, single cycle.
    wait = 0
    warmup = 1
    active = 1
    repeat = 1
    skip_first = 0
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(
            wait=wait,
            warmup=warmup,
            active=active,
            repeat=repeat,
            skip_first=skip_first,
        ),
        with_flops=True,
    ) as p:
        # Drive exactly enough forward passes to complete the schedule.
        n_step = skip_first + (wait + warmup + active) * repeat
        for _ in range(n_step):
            _ = model(data)
            p.step()

    # Restore the caller's mode.
    if is_training:
        model.train()

    statistics = p.key_averages()
    info = statistics.table(sort_by='self_cuda_time_total', row_limit=-1)
    # Average over the measured (active) steps; events with flops<=0 are noise.
    num_flops = sum(event.flops for event in statistics if event.flops > 0) / active
    (flops_scale, flops_header) = _auto_scale_flops(num_flops)
    if verbose:
        print(info)
        print(f'Total number of trainable parameters: {num_params}')
        print(f'Total number of flops: {num_flops * flops_scale:.3f}{flops_header} with {shape}')
    return {'n_parameters': num_params, 'n_flops': num_flops, 'info': info}
if __name__ == "__main__":
    import argparse

    # CLI: config path is mandatory; device and yaml overrides are optional.
    cli = argparse.ArgumentParser()
    cli.add_argument('-c', '--config', type=str, required=True)
    cli.add_argument('-d', '--device', type=str, default='cuda:0', help='device',)
    cli.add_argument('-u', '--update', nargs='+', help='Update yaml config from command line.')
    args = cli.parse_args()

    # Start from any '-u key=value' overrides, then layer the remaining CLI
    # options on top (skipping 'update' itself and options left unset).
    overrides = yaml_utils.parse_cli(args.update) if args.update else {}
    for key, value in vars(args).items():
        if key != 'update' and value is not None:
            overrides[key] = value

    cfg = YAMLConfig(args.config, **overrides)
    model = cfg.model.to(args.device)
    profile_stats(model, verbose=True)