# prismo/tools/scripts/parser.py (dsrhaslab/prismo, branch main)
import re
import pandas as pd
from tqdm import tqdm # type: ignore
from datetime import datetime
from typing import Iterator

# A log line is four bracketed groups separated by single spaces:
#   [timestamp] [module] [level] [fields]
# The first three groups are matched non-greedily; `fields` is greedy, so it
# runs up to the last ']' on the line.
LOG_RE = re.compile(
    ' '.join((
        r'\[(?P<timestamp>.*?)\]',
        r'\[(?P<module>.*?)\]',
        r'\[(?P<level>.*?)\]',
        r'\[(?P<fields>.*)\]',
    ))
)

# Column name -> dtype string; get_prismo_entries passes this mapping to
# DataFrame.astype after all chunks have been concatenated.
DTYPES = dict(
    type='uint32',
    block='uint32',
    cpr='uint32',
    sts='int64',
    ets='int64',
    pid='uint32',
    tid='uint64',
    req='uint32',
    proc='uint32',
    offset='uint64',
    ret='int32',
    errno='int32',
)


def iter_prismo_rows(log_filename: str) -> Iterator[dict]:
    """Yield one dict per line of *log_filename* that matches ``LOG_RE``.

    Each row holds the parsed ``timestamp`` (a ``datetime``), the ``module``
    and ``level`` strings, and one integer entry per ``key=value`` token in
    the trailing bracketed field list. The ``block`` value is parsed as
    hexadecimal; every other value is parsed as decimal.

    Blank lines and lines that do not match ``LOG_RE`` are skipped.

    Args:
        log_filename: Path of the log file to read.

    Yields:
        A dict with ``timestamp``, ``module``, ``level`` and the parsed
        integer fields of one log line.

    Raises:
        ValueError: If a matching line has a timestamp that is not in
            ``%Y-%m-%d %H:%M:%S.%f`` form, or a field token that is not a
            single ``key=value`` pair with an integer value.
    """
    # Count the lines first so tqdm can display a total. The context manager
    # closes this handle right away instead of leaving it open until the
    # garbage collector gets to it.
    with open(log_filename, 'r') as f:
        total_lines = sum(1 for _ in f)

    with open(log_filename, 'r') as f:
        for line in tqdm(f, total=total_lines, desc=f'Reading {log_filename}', unit='lines'):
            line = line.strip()
            if not line:
                continue

            m = LOG_RE.match(line)
            if not m:
                continue

            row = {
                'timestamp': datetime.strptime(
                    m.group('timestamp'),
                    '%Y-%m-%d %H:%M:%S.%f'
                ),
                'module': m.group('module'),
                'level': m.group('level'),
            }

            # Fields are whitespace-separated key=value tokens.
            for part in m.group('fields').split():
                k, v = part.split('=')
                if k == 'block':
                    # `block` is the only field parsed in base 16.
                    row[k] = int(v, 16)
                else:
                    row[k] = int(v)

            yield row


def get_prismo_entries(
    log_filename: str,
    chunk_size: int = 100_000
) -> pd.DataFrame:
    """Parse *log_filename* into a single typed DataFrame.

    Rows produced by ``iter_prismo_rows`` are converted to DataFrames in
    batches of ``chunk_size`` rows and then concatenated. The columns named
    in ``DTYPES`` are cast to the listed dtypes and ``timestamp`` is
    converted with ``pd.to_datetime``.

    Args:
        log_filename: Path of the log file to read.
        chunk_size: Number of rows per intermediate DataFrame.

    Returns:
        One DataFrame with a fresh integer index. When the log yields no
        rows, an empty DataFrame with the ``timestamp``, ``module`` and
        ``level`` columns plus every ``DTYPES`` column, already typed.
    """
    chunks = []
    batch = []

    for row in iter_prismo_rows(log_filename):
        batch.append(row)
        if len(batch) == chunk_size:
            chunks.append(pd.DataFrame(batch))
            batch.clear()

    # Flush the final, partially filled batch.
    if batch:
        chunks.append(pd.DataFrame(batch))

    if not chunks:
        # pd.concat raises ValueError on an empty list, so an empty or
        # non-matching log gets an empty frame with the expected schema.
        empty_columns = {
            'timestamp': pd.Series(dtype='datetime64[ns]'),
            'module': pd.Series(dtype='object'),
            'level': pd.Series(dtype='object'),
        }
        for name, dtype in DTYPES.items():
            empty_columns[name] = pd.Series(dtype=dtype)
        return pd.DataFrame(empty_columns)

    df = pd.concat(chunks, ignore_index=True)
    df = df.astype(DTYPES)
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    return df