# blob: 37bc39451078456e43e4cc19df812491e0c0f266 [file] [log] [blame]
#!/usr/bin/env python
"""Tool to filter, organize, compare and display benchmarking results. Useful
for smaller datasets: it works great with a few dozen runs, but it is not
designed to deal with hundreds.
Requires the pandas and scipy libraries to be installed."""
from __future__ import print_function
import pandas as pd
from scipy import stats
import sys
import os.path
import re
import numbers
import argparse
# Index label of the summary row that add_geomean_row() appends to the output.
GEOMEAN_ROW = 'Geomean difference'
def read_lit_json(filename):
    """Read a JSON results file into a DataFrame indexed by program name.

    The file must have a top-level 'tests' list. Every entry needs a unique
    'name'; entries without a 'metrics' dict are skipped with a warning.

    The columns are all metric names in first-seen order, followed by any
    info columns (currently only 'hash') found directly on a test entry.
    Rows are labelled with the entry's 'shortname' if present, otherwise its
    'name'. Cells a test does not provide are NaN.

    Exits the process with status 1 if the 'tests' key is missing, a test is
    unnamed, or a test name occurs twice.
    """
    import json

    with open(filename) as infile:
        jsondata = json.load(infile)

    if 'tests' not in jsondata:
        sys.stderr.write("%s: Could not find toplevel 'tests' key\n"
                         % filename)
        sys.exit(1)

    info_columns = ['hash']
    columns = []
    columnindexes = {}
    names = set()

    def register_column(column):
        # Assign the next free column slot the first time a column is seen.
        if column not in columnindexes:
            columnindexes[column] = len(columns)
            columns.append(column)

    # Pass 1: figure out the metrics (= the column indexes).
    for test in jsondata['tests']:
        name = test.get("name")
        if name is None:
            sys.stderr.write("Error: Found unnamed test\n")
            sys.exit(1)
        if name in names:
            sys.stderr.write("Error: Multiple tests with name '%s'\n" % name)
            sys.exit(1)
        names.add(name)
        if "metrics" not in test:
            print("Warning: '%s' has no metrics, skipping!" % name)
            continue
        for metricname in test["metrics"]:
            register_column(metricname)
        for key in test:
            if key in info_columns:
                register_column(key)

    # Pass 2: actual data construction.
    nan = float('NaN')
    data = []
    testnames = []
    for test in jsondata['tests']:
        if "metrics" not in test:
            continue
        testnames.append(test.get('shortname', test['name']))

        datarow = [nan] * len(columns)
        for metricname, value in test['metrics'].items():
            datarow[columnindexes[metricname]] = value
        # Top-level keys that were registered as columns (e.g. 'hash').
        for key, value in test.items():
            index = columnindexes.get(key)
            if index is not None:
                datarow[index] = value
        data.append(datarow)

    index = pd.Index(testnames, name='Program')
    return pd.DataFrame(data=data, index=index, columns=columns)
def read_report_simple_csv(filename):
    """Read a CSV report into a DataFrame.

    The first line is taken as the header and the first column as the row
    index; cells containing '*' are read as missing values.
    """
    frame = pd.read_csv(filename, header=0, index_col=0, na_values=['*'])
    return frame
def read(name):
    """Read one results file, choosing the reader from the file extension.

    '.json' files go through read_lit_json() and '.csv' files through
    read_report_simple_csv().

    Raises:
        ValueError: if the name has neither extension. The message names the
            offending file so the user can tell which argument was rejected.
    """
    if name.endswith(".json"):
        return read_lit_json(name)
    if name.endswith(".csv"):
        return read_report_simple_csv(name)
    raise ValueError("Cannot determine file format of '%s'" % name)
def readmulti(filenames):
    """Read several result files and stack them into one DataFrame.

    Each file is loaded with read(). The datasets are concatenated along the
    rows with an extra outer index level called 'run'. The run key is the
    file's base name with a trailing '.csv'/'.json' removed; if that key is
    already taken, a numeric suffix ("0", "1", ...) is appended to make it
    unique.

    A warning is written to stderr when two consecutive datasets use
    different index names.
    """
    # Read datasets
    datasetnames = []
    datasets = []
    prev_index = None
    for filename in filenames:
        data = read(filename)
        name = os.path.basename(filename)
        # drop .json/.csv suffix; TODO: Should we rather do this in the
        # printing logic?
        for ext in ['.csv', '.json']:
            if name.endswith(ext):
                name = name[:-len(ext)]
        datasets.append(data)

        # Make the dataset name unique among the ones collected so far.
        suffix = ""
        count = 0
        while name + suffix in datasetnames:
            suffix = str(count)
            count += 1
        datasetnames.append(name + suffix)

        # Warn if index names are different
        if prev_index is not None and prev_index.name != data.index.name:
            sys.stderr.write("Warning: Mismatched index names: '%s' vs '%s'\n"
                             % (prev_index.name, data.index.name))
        prev_index = data.index

    # Merge datasets
    return pd.concat(datasets, axis=0, names=['run'], keys=datasetnames)
def get_values(values):
    """Return the pair of value series to compare for one metric.

    A 'diff' column, if present, is ignored. With exactly two remaining
    columns they are returned as (first, second); otherwise the row-wise
    (minimum, maximum) over all remaining columns is returned.
    """
    runs = values
    if 'diff' in runs.columns:
        # Work on a view of the data without the diff column.
        kept = [col for col in runs.columns if col != 'diff']
        runs = runs[kept]

    if len(runs.columns) != 2:
        return (runs.min(axis=1), runs.max(axis=1))

    first = runs.iloc[:, 0]
    second = runs.iloc[:, 1]
    return (first, second)
def add_diff_column(metric, values, absolute_diff=False):
    """Add a (metric, 'diff') column to `values` and return `values`.

    The two series to compare come from get_values(values[metric]); missing
    entries are treated as 0.0. With absolute_diff the column holds
    `second - first`, otherwise the relative change `second / first - 1.0`.
    """
    values0, values1 = get_values(values[metric])
    # Fill on fresh objects rather than in place: the series are derived from
    # a selection of `values`, so in-place modification is unreliable.
    values0 = values0.fillna(0.0)
    values1 = values1.fillna(0.0)
    # Quotient or absolute difference?
    if absolute_diff:
        values[(metric, 'diff')] = values1 - values0
    else:
        values[(metric, 'diff')] = (values1 / values0) - 1.0
    return values
def add_geomean_row(metrics, data, dataout):
    """
    Normalize values1 over values0, compute geomean difference and add a
    summary row to dataout.

    For every metric the ratio second/first (as chosen by get_values) is
    computed per program; the geometric mean of those ratios minus 1.0 is
    stored in the (metric, 'diff') cell of a new row labelled GEOMEAN_ROW.
    Returns dataout with that row appended; all other cells of the row are
    left empty (NaN).
    """
    # Explicit float dtype so the untouched cells of the summary row are NaN.
    gm = pd.DataFrame(index=[GEOMEAN_ROW], columns=dataout.columns,
                      dtype='float64')
    for metric in metrics:
        values0, values1 = get_values(data[metric])
        # Avoid infinite values in the diff and instead use NaN, as otherwise
        # the computation of the geometric mean will fail.
        values0 = values0.replace({0: float('NaN')})
        relative = values1 / values0
        gm_diff = stats.gmean(relative.dropna()) - 1.0
        gm[(metric, 'diff')] = gm_diff
    gm.Program = GEOMEAN_ROW
    return pd.concat([dataout, gm])
def filter_failed(data, key='Exec'):
    """Keep only the rows whose `key` column equals the string "pass"."""
    passed = data[key] == "pass"
    return data.loc[passed]
def filter_short(data, threshold, key='Exec_Time'):
    """Keep only the rows whose `key` column is at least `threshold`."""
    long_enough = data[key] >= threshold
    return data.loc[long_enough]
def filter_same_hash(data, key='hash'):
    """Drop every program whose `key` value is identical across all its rows.

    Rows are grouped by index level 1; a group is kept only when its `key`
    column does not have exactly one distinct value. Asserts that `key` is a
    column and that index level 0 has more than one distinct value.
    """
    assert key in data.columns
    assert data.index.get_level_values(0).nunique() > 1

    def hashes_differ(group):
        return group[key].nunique() != 1

    per_program = data.groupby(level=1)
    return per_program.filter(hashes_differ)
def filter_blacklist(data, blacklist):
    """Drop the rows whose index level 1 value appears in `blacklist`."""
    programs = data.index.get_level_values(1)
    allowed = ~(programs.isin(blacklist))
    return data.loc[allowed]
def print_filter_stats(reason, before, after):
    """Print how many index-level-1 groups a filter step removed.

    Nothing is printed when the number of groups is unchanged.
    """
    groups_before = len(before.groupby(level=1))
    groups_after = len(after.groupby(level=1))
    n_filtered = groups_before - groups_after
    if n_filtered == 0:
        return
    print("%s: %s (filtered out)" % (reason, n_filtered))
# Truncate a string to a maximum length by keeping a prefix, a suffix and ...
# in the middle
def truncate(string, prefix_len, suffix_len):
    """Shorten `string` to at most prefix_len + 3 + suffix_len characters.

    The first `prefix_len` and last `suffix_len` characters are kept and the
    part in between is replaced by "...". Strings that already fit within
    that length are returned unchanged, so the result is never longer than
    the input.
    """
    ellipsis = "..."
    if len(string) <= prefix_len + len(ellipsis) + suffix_len:
        return string
    # Slice from an explicit start offset: string[-0:] would be the whole
    # string when suffix_len is 0.
    tail = string[len(string) - suffix_len:]
    return string[:prefix_len] + ellipsis + tail
# Search for common prefixes and suffixes in a list of names and return
# a (prefix,suffix) tuple that specifies how many characters can be dropped
# for the prefix/suffix. The numbers will be small enough that no name will
# become shorter than min_len characters.
def determine_common_prefix_suffix(names, min_len=8):
    """Return (prefix_len, suffix_len) of characters common to all names.

    `names` may be any iterable of strings (including a pandas Series). A
    common suffix is only reported when it starts with '.' or '_'. With
    fewer than two names the result is (0, 0).
    """
    # Materialize first so that names[0] is positional even when the input
    # is a Series indexed by labels.
    names = list(names)
    if len(names) <= 1:
        return (0, 0)

    name0 = names[0]
    prefix_len = len(name0)
    suffix_len = len(name0)
    shortest_name = len(name0)
    for name in names:
        shortest_name = min(shortest_name, len(name))
        # Shrink the candidate prefix/suffix of name0 until `name` shares it.
        while prefix_len > 0 and name[:prefix_len] != name0[:prefix_len]:
            prefix_len -= 1
        while suffix_len > 0 and name[-suffix_len:] != name0[-suffix_len:]:
            suffix_len -= 1

    # Only strip suffixes that look like an extension or a tag. Checking
    # suffix_len first avoids indexing into an empty name.
    if suffix_len == 0 or name0[-suffix_len] not in ('.', '_'):
        suffix_len = 0

    suffix_len = max(0, min(shortest_name - prefix_len - min_len, suffix_len))
    prefix_len = max(0, min(shortest_name - suffix_len, prefix_len))
    return (prefix_len, suffix_len)
def format_relative_diff(value):
    """Format one cell of a relative 'diff' column.

    Integral values are printed as left-aligned integers in a field of
    width 5; everything else is scaled by 100 and printed as a percentage
    with one decimal place.
    """
    if isinstance(value, numbers.Integral):
        return "%-5d" % value
    percent = value * 100.
    return "%4.1f%%" % percent
def print_result(d, limit_output=True, shorten_names=True, minimal_names=False,
                 show_diff_column=True, sortkey='diff', sort_by_abs=True,
                 absolute_diff=False):
    """Sort, format and print the comparison table.

    Args:
        d: DataFrame with two-level columns (metric, run-or-'diff') and one
            row per program.
        limit_output: print only the 15 topmost rows after sorting.
        shorten_names: shorten the program names before printing.
        minimal_names: with shorten_names, keep only the last path component
            without a '.test' suffix instead of stripping the common
            prefix/suffix and truncating.
        show_diff_column: keep the 'diff' columns in the output.
        sortkey: second-level column of the first metric to sort by.
        sort_by_abs: sort by absolute value instead of signed value.
        absolute_diff: the diff columns hold absolute differences, so they
            are not rendered as percentages and no geomean row is added.
    """
    metrics = d.columns.levels[0]
    sort_column = (metrics[0], sortkey)
    if sort_by_abs:
        d = d.sort_values(by=sort_column, key=pd.Series.abs, ascending=False)
    else:
        d = d.sort_values(by=sort_column, ascending=False)

    # Ensure that the columns are grouped by metric (rather than having the
    # diffs at the end of the line).
    d = d.reindex(columns=d.columns.levels[0], level=0)

    if not show_diff_column:
        # Remove all diff columns (using level=1 since level 0 is the metric).
        d = d.drop(labels='diff', level=1, axis=1)

    # Take 15 topmost elements when limiting. Work on a copy so inserting the
    # 'Program' column below never writes into a slice of `d`.
    dataout = (d.head(15) if limit_output else d).copy()

    formatters = dict()
    if not absolute_diff:
        for m in metrics:
            formatters[(m, 'diff')] = format_relative_diff

    # Turn index into a column so we can format it...
    formatted_program = dataout.index.to_series()
    if shorten_names:
        def format_name(name, common_prefix, common_suffix):
            name = name[common_prefix:]
            if common_suffix > 0:
                name = name[:-common_suffix]
            return "%-45s" % truncate(name, 10, 30)

        def strip_name_fully(name):
            name = name.split('/')[-1]
            if name.endswith('.test'):
                name = name[:-5]
            return name

        # The to_string formatters argument appears to be ignored for
        # dtype=object, so transform the program column manually.
        if minimal_names:
            formatted_program = formatted_program.map(strip_name_fully)
        else:
            # Pass a plain list so the helper indexes names by position.
            drop_prefix, drop_suffix = determine_common_prefix_suffix(
                list(formatted_program))
            formatted_program = formatted_program.map(
                lambda name: format_name(name, drop_prefix, drop_suffix))
    dataout.insert(0, 'Program', formatted_program)

    # Add the geometric mean row after we have formatted the program names
    # as it will otherwise interfere with common prefix/suffix computation.
    if show_diff_column and not absolute_diff:
        # geometric mean only makes sense for relative differences.
        dataout = add_geomean_row(metrics, d, dataout)

    def float_format(x):
        if x == '':
            return ''
        return "%6.2f" % (x,)

    pd.set_option("display.max_colwidth", 0)
    pd.set_option('display.width', 0)
    # Print an empty value instead of NaN (for the geomean row).
    out = dataout.to_string(index=False, justify='left', na_rep='',
                            float_format=float_format, formatters=formatters)
    print(out)
def main():
    """Command-line entry point: read, filter, diff and print results.

    Input files are given positionally; putting 'vs' between two groups of
    files merges each group per program (minimum by default) and compares the
    two merged sides. Exits with status 1 when no usable metric is found or
    a requested metric does not exist.
    """
    parser = argparse.ArgumentParser(prog='')
    parser.add_argument('-a', '--all', action='store_true')
    parser.add_argument('-f', '--full', action='store_true')
    parser.add_argument('-m', '--metric', action='append', dest='metrics',
                        default=[])
    # default=None lets us tell "not specified" apart from an explicit
    # --diff/--nodiff below.
    parser.add_argument('--nodiff', action='store_false', dest='show_diff',
                        default=None)
    parser.add_argument('--diff', action='store_true', dest='show_diff')
    parser.add_argument('--absolute-diff', action='store_true',
                        help='Use an absolute instead of a relative difference')
    # const is the value used when the option is given without an argument
    # (e.g. as the last thing on the command line); without it argparse
    # yields None and the option would silently do nothing.
    parser.add_argument('--filter-short', nargs='?',
                        dest='filter_short', default=None, const='1.0',
                        help="Filter benchmarks with execution times less than N seconds (default 1.0s)")
    parser.add_argument('--no-filter-failed', action='store_false',
                        dest='filter_failed', default=True)
    parser.add_argument('--filter-hash', action='store_true',
                        dest='filter_hash', default=False)
    parser.add_argument('--filter-blacklist',
                        dest='filter_blacklist', default=None)
    parser.add_argument('--merge-average', action='store_const',
                        dest='merge_function', const=pd.DataFrame.mean,
                        default=pd.DataFrame.min)
    parser.add_argument('--merge-min', action='store_const',
                        dest='merge_function', const=pd.DataFrame.min)
    parser.add_argument('--merge-max', action='store_const',
                        dest='merge_function', const=pd.DataFrame.max)
    parser.add_argument('--lhs-name', default="lhs",
                        help="Name used to describe left side in 'vs' mode")
    parser.add_argument('--rhs-name', default="rhs",
                        help="Name used to describe right side in 'vs' mode")
    parser.add_argument('files', metavar='FILE', nargs='+', help="To compare two groups of results, put 'vs' between them")
    parser.add_argument('--minimal-names', action='store_true',
                        dest='minimal_names', default=False)
    parser.add_argument('--no-abs-sort', action='store_true',
                        dest='no_abs_sort', default=False, help="Don't use abs() when sorting results")
    config = parser.parse_args()

    if config.show_diff is None:
        config.show_diff = len(config.files) > 1

    # If only --filter-short is provided, i.e. its optional argument is
    # omitted, we default to threshold of 1 second to filter out apps and
    # results with a execution time less than that.
    filter_short_threshold = 1.0

    # If the optional argument to --filter-short is omitted, we need to take
    # care of this case and command line:
    #
    #   --filter-short FILE [FILE ...]
    #
    # I.e., we need to recognise that FILE is not the optional argument to
    # --filter-short. The way we do this, is to try converting the option value
    # to a float, and if that fails, we insert it back into the files list (in
    # the first position).
    if config.filter_short is not None:
        try:
            filter_short_threshold = float(config.filter_short)
        except ValueError:
            config.files.insert(0, config.filter_short)

    # Read inputs
    files = config.files
    if "vs" in files:
        split = files.index("vs")
        lhs = files[0:split]
        rhs = files[split+1:]

        # Filter minimum of lhs and rhs
        lhs_d = readmulti(lhs)
        lhs_merged = lhs_d.groupby(level=1).apply(config.merge_function)
        rhs_d = readmulti(rhs)
        rhs_merged = rhs_d.groupby(level=1).apply(config.merge_function)

        # Combine to new dataframe
        data = pd.concat([lhs_merged, rhs_merged], names=['l/r'],
                         keys=[config.lhs_name, config.rhs_name])
    else:
        data = readmulti(files)

    # Decide which metric to display / what is our "main" metric
    metrics = config.metrics
    if len(metrics) == 0:
        # The first default present in the data wins.
        defaults = ['Exec_Time', 'exec_time', 'Value', 'Runtime']
        for defkey in defaults:
            if defkey in data.columns:
                metrics = [defkey]
                break
    if len(metrics) == 0:
        sys.stderr.write("No default metric found and none specified\n")
        sys.stderr.write("Available metrics:\n")
        for column in data.columns:
            sys.stderr.write("\t%s\n" % column)
        sys.exit(1)
    unknown_metrics = [m for m in metrics if m not in data.columns]
    for metric in unknown_metrics:
        sys.stderr.write("Unknown metric '%s'\n" % metric)
    if unknown_metrics:
        sys.exit(1)

    # Filter data
    proggroup = data.groupby(level=1)
    initial_size = len(proggroup.indices)
    print("Tests: %s" % (initial_size,))
    if config.filter_failed and 'Exec' in data.columns:
        newdata = filter_failed(data)
        print_filter_stats("Failed", data, newdata)
        data = newdata.drop(columns='Exec')
    if config.filter_short is not None:
        # Short-run filtering is keyed on the last requested metric.
        newdata = filter_short(data, filter_short_threshold, metrics[-1])
        print_filter_stats("Short Running", data, newdata)
        data = newdata
    if config.filter_hash and 'hash' in data.columns and \
       data.index.get_level_values(0).nunique() > 1:
        newdata = filter_same_hash(data)
        print_filter_stats("Same hash", data, newdata)
        data = newdata
    if config.filter_blacklist:
        with open(config.filter_blacklist) as blacklist_file:
            blacklist = [line.strip() for line in blacklist_file]
        newdata = filter_blacklist(data, blacklist)
        print_filter_stats("In Blacklist", data, newdata)
        data = newdata
    final_size = len(data.groupby(level=1))
    if final_size != initial_size:
        print("Remaining: %d" % (final_size,))

    # Reduce / add columns
    print("Metric: %s" % (",".join(metrics),))
    data = data[metrics]
    data = data.unstack(level=0)

    for metric in data.columns.levels[0]:
        data = add_diff_column(metric, data, absolute_diff=config.absolute_diff)

    sortkey = 'diff'
    # TODO: should we still be sorting by diff even if the diff is hidden?
    if len(config.files) == 1:
        sortkey = data.columns.levels[1][0]

    # Print data
    shorten_names = not config.full
    limit_output = (not config.all) and (not config.full)
    # The seventh argument is print_result's sort_by_abs, so the
    # --no-abs-sort flag has to be negated here.
    print_result(data, limit_output, shorten_names, config.minimal_names,
                 config.show_diff, sortkey, not config.no_abs_sort,
                 config.absolute_diff)
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()