"""
Utilities to help with the analysis of data for reporting purposes.
"""
from lnt.testing import FAIL
from lnt.util import logger
from lnt.util import multidict
from lnt.util import stats
REGRESSED = 'REGRESSED'
IMPROVED = 'IMPROVED'
UNCHANGED_PASS = 'UNCHANGED_PASS'
UNCHANGED_FAIL = 'UNCHANGED_FAIL'
# The smallest measurable change we can detect, in seconds.
MIN_VALUE_PRECISION = 0.0001
# The minimum relative difference that is visible in reports (0.01 == 1%).
MIN_PERCENTAGE_CHANGE = .01
def absmin_diff(current, prevs):
"""Min of differences between current sample and all previous samples.
Given more than one min, use the last one detected which is probably a
newer value. Returns (difference, prev used)
"""
diffs = [abs(current-prev) for prev in prevs]
smallest_pos = 0
smallest = diffs[0]
for i, diff in enumerate(diffs):
if diff <= smallest:
smallest = diff
smallest_pos = i
return current-prevs[smallest_pos], prevs[smallest_pos]
def calc_geomean(run_values):
    # NOTE: The geometric mean is only defined for positive values, so work
    # around zeros by adding MIN_VALUE_PRECISION to each value and subtracting
    # it from the result. Since we are only interested in the change of the
    # central tendency, this workaround is good enough.
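    # Illustrative example (assumed values): calc_geomean([4.0, 9.0]) returns
    # roughly sqrt(4.0001 * 9.0001) - 0.0001, i.e. about 6.0, while
    # calc_geomean([]) returns None.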
values = [v + MIN_VALUE_PRECISION for v in run_values]
if not values:
return None
return stats.geometric_mean(values) - MIN_VALUE_PRECISION
class ComparisonResult:
"""A ComparisonResult is ultimatly responsible for determining if a test
improves, regresses or does not change, given some new and old data."""
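    # Illustrative usage (assumed sample values, a sketch rather than a real
    # run): with the builtin min as the aggregation function,
    #   ComparisonResult(min, False, False, [10.2, 10.3], [9.9, 10.0],
    #                    None, None).get_value_status()
    # aggregates the current samples to 10.2, picks 10.0 as the closest
    # previous sample, and reports REGRESSED for this ~2% slowdown.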
def __init__(self, aggregation_fn,
cur_failed, prev_failed, samples, prev_samples,
cur_hash, prev_hash, cur_profile=None, prev_profile=None,
confidence_lv=0.05, bigger_is_better=False):
self.aggregation_fn = aggregation_fn
# Special case: if we're using the minimum to aggregate, swap it for
# max if bigger_is_better.
if aggregation_fn == stats.safe_min and bigger_is_better:
aggregation_fn = stats.safe_max
self.cur_hash = cur_hash
self.prev_hash = prev_hash
self.cur_profile = cur_profile
self.prev_profile = prev_profile
if samples:
self.current = aggregation_fn(samples)
else:
self.current = None
self.previous = None
# Compute the comparison status for the test value.
self.delta = 0
self.pct_delta = 0.0
if self.current and prev_samples:
self.delta, value = absmin_diff(self.current, prev_samples)
if value != 0:
self.pct_delta = self.delta / value
self.previous = value
# If we have multiple values for this run, use that to estimate the
# distribution.
#
# We can get integer sample types here - for example if the field is
# .exec.status. Make sure we don't assert by avoiding the stats
# functions in this case.
if samples and len(samples) > 1 and isinstance(samples[0], float):
self.stddev = stats.standard_deviation(samples)
self.MAD = stats.median_absolute_deviation(samples)
else:
self.stddev = None
self.MAD = None
self.failed = cur_failed
self.prev_failed = prev_failed
self.samples = samples
self.prev_samples = prev_samples
self.confidence_lv = confidence_lv
self.bigger_is_better = bigger_is_better
def __repr__(self):
"""Print this ComparisonResult's constructor.
Handy for generating test cases for comparisons doing odd things."""
fmt = "{}(" + "{}, " * 9 + ")"
return fmt.format(self.__class__.__name__,
self.aggregation_fn.__name__,
self.failed,
self.prev_failed,
self.cur_hash,
self.prev_hash,
self.samples,
self.prev_samples,
self.confidence_lv,
bool(self.bigger_is_better))
def __json__(self):
        # Copy __dict__ so serialization does not overwrite the instance's
        # aggregation_fn attribute with its name.
        simple_dict = dict(self.__dict__)
simple_dict['aggregation_fn'] = self.aggregation_fn.__name__
return simple_dict
def is_result_performance_change(self):
"""Check if we think there was a performance change."""
if self.get_value_status() in (REGRESSED, IMPROVED):
return True
return False
def is_result_interesting(self):
"""is_result_interesting() -> bool
Check whether the result is worth displaying, either because of a
failure, a test status change or a performance change."""
if self.get_test_status() != UNCHANGED_PASS:
return True
if self.get_value_status() in (REGRESSED, IMPROVED):
return True
return False
def get_test_status(self):
# Compute the comparison status for the test success.
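        # Status transitions: fail -> fail is UNCHANGED_FAIL, pass -> fail is
        # REGRESSED, fail -> pass is IMPROVED, pass -> pass is UNCHANGED_PASS.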
if self.failed:
if self.prev_failed:
return UNCHANGED_FAIL
else:
return REGRESSED
else:
if self.prev_failed:
return IMPROVED
else:
return UNCHANGED_PASS
# FIXME: take into account hash of binary - if available. If the hash is
# the same, the binary is the same and therefore the difference cannot be
# significant - for execution time. It can be significant for compile time.
def get_value_status(self, confidence_interval=2.576,
value_precision=MIN_VALUE_PRECISION,
ignore_small=True):
if self.current is None or self.previous is None:
return None
# Don't report value errors for tests which fail, or which just started
# passing.
#
# FIXME: One bug here is that we risk losing performance data on tests
# which flop to failure then back. What would be nice to do here is to
# find the last value in a passing run, or to move to using proper
# keyed reference runs.
if self.failed:
return UNCHANGED_FAIL
elif self.prev_failed:
return UNCHANGED_PASS
        # Always ignore percentage changes below MIN_PERCENTAGE_CHANGE; for
        # now we simply don't have the capacity to investigate changes that
        # small.
if ignore_small and abs(self.pct_delta) < MIN_PERCENTAGE_CHANGE:
return UNCHANGED_PASS
        # Always ignore changes with small deltas. There is no mathematical
        # basis for this; it should be obviated by appropriate statistical
        # checks, but practical evidence indicates what we currently have
        # isn't good enough (for reasons I do not yet understand).
if ignore_small and abs(self.delta) < MIN_PERCENTAGE_CHANGE:
return UNCHANGED_PASS
# Ignore tests whose delta is too small relative to the precision we
# can sample at; otherwise quantization means that we can't measure the
# standard deviation with enough accuracy.
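        # With the defaults (value_precision == 0.0001, confidence_interval ==
        # 2.576) this ignores deltas of roughly 0.0005 seconds or less.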
if abs(self.delta) <= 2 * value_precision * confidence_interval:
return UNCHANGED_PASS
# Use Mann-Whitney U test to test null hypothesis that result is
# unchanged.
if len(self.samples) >= 4 and len(self.prev_samples) >= 4:
same = stats.mannwhitneyu(self.samples, self.prev_samples,
self.confidence_lv)
if same:
return UNCHANGED_PASS
        # If we have a comparison window, then measure using a symmetric
        # confidence interval.
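        # For example (illustrative numbers): with a standard deviation of
        # 0.002 seconds and the default confidence_interval of 2.576, a delta
        # must exceed about 0.005 seconds to be reported as significant.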
if self.stddev is not None:
is_significant = abs(self.delta) > (self.stddev *
confidence_interval)
# If the delta is significant, return
if is_significant:
if self.delta < 0:
return REGRESSED if self.bigger_is_better else IMPROVED
else:
return IMPROVED if self.bigger_is_better else REGRESSED
else:
return UNCHANGED_PASS
# Otherwise, report any changes above 0.2%, which is a rough
# approximation for the smallest change we expect "could" be measured
# accurately.
if not ignore_small or abs(self.pct_delta) >= .002:
if self.pct_delta < 0:
return REGRESSED if self.bigger_is_better else IMPROVED
else:
return IMPROVED if self.bigger_is_better else REGRESSED
else:
return UNCHANGED_PASS
class RunInfo(object):
def __init__(self, session, testsuite, runs_to_load,
aggregation_fn=stats.safe_min, confidence_lv=.05,
only_tests=None):
"""Get all the samples needed to build a CR.
runs_to_load are the run IDs of the runs to get the samples from.
if only_tests is passed, only samples form those test IDs are fetched.
"""
self.testsuite = testsuite
self.aggregation_fn = aggregation_fn
self.confidence_lv = confidence_lv
self.sample_map = multidict.multidict()
self.profile_map = dict()
self.loaded_run_ids = set()
self._load_samples_for_runs(session, runs_to_load, only_tests)
@property
def test_ids(self):
return set(key[1] for key in self.sample_map.keys())
def get_sliding_runs(self, session, run, compare_run,
num_comparison_runs=0):
"""
Get num_comparison_runs most recent runs,
This query is expensive.
"""
runs = [run]
runs_prev = self.testsuite \
.get_previous_runs_on_machine(session, run, num_comparison_runs)
runs += runs_prev
if compare_run is not None:
compare_runs = [compare_run]
comp_prev = self.testsuite \
.get_previous_runs_on_machine(session, compare_run,
num_comparison_runs)
compare_runs += comp_prev
else:
compare_runs = []
return runs, compare_runs
def get_run_comparison_result(self, run, compare_to, test_id, field,
hash_of_binary_field):
if compare_to is not None:
compare_to = [compare_to]
else:
compare_to = []
return self.get_comparison_result([run], compare_to, test_id, field,
hash_of_binary_field)
def get_samples(self, runs, test_id):
all_samples = []
for run in runs:
samples = self.sample_map.get((run.id, test_id))
if samples is not None:
all_samples.extend(samples)
return all_samples
def get_comparison_result(self, runs, compare_runs, test_id, field,
hash_of_binary_field):
# Get the field which indicates the requested field's status.
status_field = field.status_field
# Load the sample data for the current and previous runs and the
# comparison window.
run_samples = self.get_samples(runs, test_id)
prev_samples = self.get_samples(compare_runs, test_id)
cur_profile = prev_profile = None
if runs:
cur_profile = self.profile_map.get((runs[0].id, test_id), None)
if compare_runs:
prev_profile = self.profile_map.get((compare_runs[0].id, test_id),
None)
# Determine whether this (test,pset) passed or failed in the current
# and previous runs.
#
# FIXME: Support XFAILs and non-determinism (mixed fail and pass)
# better.
run_failed = prev_failed = False
if status_field:
status_field_index = self.testsuite.get_field_index(status_field)
for sample in run_samples:
run_failed |= sample[status_field_index] == FAIL
for sample in prev_samples:
prev_failed |= sample[status_field_index] == FAIL
field_index = self.testsuite.get_field_index(field)
# Get the current and previous values.
run_values = [s[field_index] for s in run_samples
if s[field_index] is not None]
prev_values = [s[field_index] for s in prev_samples
if s[field_index] is not None]
if hash_of_binary_field:
hash_of_binary_field_index = \
self.testsuite.get_field_index(hash_of_binary_field)
hash_values = [s[hash_of_binary_field_index] for s in run_samples
if s[hash_of_binary_field_index] is not None]
            prev_hash_values = [s[hash_of_binary_field_index]
                                for s in prev_samples
                                if s[hash_of_binary_field_index] is not None]
            # All current hash_values should be the same.
            # Warn in the log when the hash wasn't the same for all samples.
cur_hash_set = set(hash_values)
if len(cur_hash_set) > 1:
logger.warning("Found different hashes for multiple samples "
"in the same run {0}: {1}\nTestID:{2}"
.format(runs, hash_values, test_id))
cur_hash = hash_values[0] if len(hash_values) > 0 else None
prev_hash = prev_hash_values[0] \
if len(prev_hash_values) > 0 else None
else:
cur_hash = None
prev_hash = None
r = ComparisonResult(self.aggregation_fn,
run_failed, prev_failed, run_values,
prev_values, cur_hash, prev_hash,
cur_profile, prev_profile,
self.confidence_lv,
bigger_is_better=field.bigger_is_better)
return r
def get_geomean_comparison_result(self, run, compare_to, field, tests):
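        # Only tests that passed in both runs (UNCHANGED_PASS) contribute to
        # the geometric mean; their aggregated values and hashes are collected
        # below.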
unchanged_tests = [(cr.previous, cr.current, cr.prev_hash, cr.cur_hash)
for _, _, cr in tests
if cr.get_test_status() == UNCHANGED_PASS]
if unchanged_tests:
prev_values, run_values, prev_hash, cur_hash = zip(
*unchanged_tests)
prev_values = [x for x in prev_values if x is not None]
run_values = [x for x in run_values if x is not None]
prev_hash = [x for x in prev_hash if x is not None]
cur_hash = [x for x in cur_hash if x is not None]
prev_hash = prev_hash[0] if len(prev_hash) > 0 else None
cur_hash = cur_hash[0] if len(cur_hash) > 0 else None
            prev_geo = calc_geomean(prev_values)
            prev_values = [prev_geo] if prev_geo else []
            run_geo = calc_geomean(run_values)
            run_values = [run_geo] if run_geo else []
else:
prev_values, run_values, prev_hash, cur_hash = [], [], None, None
return ComparisonResult(self.aggregation_fn,
cur_failed=not bool(run_values),
prev_failed=not bool(prev_values),
samples=run_values,
prev_samples=prev_values,
cur_hash=cur_hash,
prev_hash=prev_hash,
confidence_lv=0,
bigger_is_better=field.bigger_is_better)
def _load_samples_for_runs(self, session, run_ids, only_tests):
# Find the set of new runs to load.
to_load = set(run_ids) - self.loaded_run_ids
if not to_load:
return
# Batch load all of the samples for the needed runs.
#
# We speed things up considerably by loading the column data directly
# here instead of requiring SA to materialize Sample objects.
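        # The resulting sample_map maps (run_id, test_id) to one tuple of
        # sample field values per matching sample row; multidict accumulates
        # all samples that share a key.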
columns = [self.testsuite.Sample.run_id,
self.testsuite.Sample.test_id,
self.testsuite.Sample.profile_id]
columns.extend(f.column for f in self.testsuite.sample_fields)
q = session.query(*columns)
if only_tests:
q = q.filter(self.testsuite.Sample.test_id.in_(only_tests))
q = q.filter(self.testsuite.Sample.run_id.in_(to_load))
for data in q:
run_id = data[0]
test_id = data[1]
profile_id = data[2]
sample_values = data[3:]
self.sample_map[(run_id, test_id)] = sample_values
if profile_id is not None:
self.profile_map[(run_id, test_id)] = profile_id
self.loaded_run_ids |= to_load