| """ |
Utilities to help with the analysis of data for reporting purposes.
| """ |
| from lnt.testing import FAIL |
| from lnt.util import logger |
| from lnt.util import multidict |
from lnt.util import stats
| |
| REGRESSED = 'REGRESSED' |
| IMPROVED = 'IMPROVED' |
| UNCHANGED_PASS = 'UNCHANGED_PASS' |
| UNCHANGED_FAIL = 'UNCHANGED_FAIL' |
| |
# The smallest measurable change we can detect, in seconds.
| MIN_VALUE_PRECISION = 0.0001 |
| |
# Minimal relative change that is visible in reports. This is a fraction,
# so .01 means 1%.
| MIN_PERCENTAGE_CHANGE = .01 |
| |

def absmin_diff(current, prevs):
| """Min of differences between current sample and all previous samples. |
| Given more than one min, use the last one detected which is probably a |
| newer value. Returns (difference, prev used) |
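
    For example, a tie between differences is resolved in favor of the later
    (and therefore probably newer) previous sample:

        >>> absmin_diff(10.0, [9.0, 12.0, 11.0])
        (-1.0, 11.0)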
| """ |
| diffs = [abs(current-prev) for prev in prevs] |
| smallest_pos = 0 |
| smallest = diffs[0] |
| for i, diff in enumerate(diffs): |
| if diff <= smallest: |
| smallest = diff |
| smallest_pos = i |
| return current-prevs[smallest_pos], prevs[smallest_pos] |
| |
| |
| def calc_geomean(run_values): |
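    """Return the geometric mean of run_values, or None if run_values is
    empty."""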
    # NOTE: The geometric mean is only defined for positive values, so work
    # around zero values by adding MIN_VALUE_PRECISION to each value and
    # subtracting it from the result. Since we are only interested in the
    # change of the central tendency, this workaround is good enough.
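    # For instance, calc_geomean([2.0, 8.0]) computes
    # geometric_mean([2.0001, 8.0001]) - 0.0001, which is approximately 4.0;
    # the shift only materially matters for values at or near zero.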
| |
| values = [v + MIN_VALUE_PRECISION for v in run_values] |
| |
| if not values: |
| return None |
| |
| return stats.geometric_mean(values) - MIN_VALUE_PRECISION |
| |
| |
class ComparisonResult(object):
    """A ComparisonResult is ultimately responsible for determining whether a
    test improved, regressed or was unchanged, given some new and old data."""
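
    # A minimal usage sketch: with min-based aggregation of a
    # smaller-is-better metric, a drop from 2.0 in the previous run to 1.0 in
    # the current run is reported as an improvement:
    #
    #   cr = ComparisonResult(stats.safe_min, cur_failed=False,
    #                         prev_failed=False, samples=[1.0],
    #                         prev_samples=[2.0], cur_hash=None,
    #                         prev_hash=None)
    #   assert cr.get_value_status() == IMPROVED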
| |
| def __init__(self, aggregation_fn, |
| cur_failed, prev_failed, samples, prev_samples, |
| cur_hash, prev_hash, cur_profile=None, prev_profile=None, |
| confidence_lv=0.05, bigger_is_better=False): |
| self.aggregation_fn = aggregation_fn |
| |
| # Special case: if we're using the minimum to aggregate, swap it for |
| # max if bigger_is_better. |
| if aggregation_fn == stats.safe_min and bigger_is_better: |
| aggregation_fn = stats.safe_max |
| |
| self.cur_hash = cur_hash |
| self.prev_hash = prev_hash |
| self.cur_profile = cur_profile |
| self.prev_profile = prev_profile |
| |
| if samples: |
| self.current = aggregation_fn(samples) |
| else: |
| self.current = None |
| |
| self.previous = None |
| |
| # Compute the comparison status for the test value. |
| self.delta = 0 |
| self.pct_delta = 0.0 |
| if self.current and prev_samples: |
| self.delta, value = absmin_diff(self.current, prev_samples) |
| if value != 0: |
| self.pct_delta = self.delta / value |
| self.previous = value |
| |
| # If we have multiple values for this run, use that to estimate the |
| # distribution. |
| # |
| # We can get integer sample types here - for example if the field is |
| # .exec.status. Make sure we don't assert by avoiding the stats |
| # functions in this case. |
| if samples and len(samples) > 1 and isinstance(samples[0], float): |
| self.stddev = stats.standard_deviation(samples) |
| self.MAD = stats.median_absolute_deviation(samples) |
| else: |
| self.stddev = None |
| self.MAD = None |
| |
| self.failed = cur_failed |
| self.prev_failed = prev_failed |
| self.samples = samples |
| self.prev_samples = prev_samples |
| |
| self.confidence_lv = confidence_lv |
| self.bigger_is_better = bigger_is_better |
| |
| def __repr__(self): |
| """Print this ComparisonResult's constructor. |
| |
| Handy for generating test cases for comparisons doing odd things.""" |
| fmt = "{}(" + "{}, " * 9 + ")" |
| return fmt.format(self.__class__.__name__, |
| self.aggregation_fn.__name__, |
| self.failed, |
| self.prev_failed, |
| self.cur_hash, |
| self.prev_hash, |
| self.samples, |
| self.prev_samples, |
| self.confidence_lv, |
| bool(self.bigger_is_better)) |
| |
| def __json__(self): |
        # Copy the attribute dict so replacing aggregation_fn with its name
        # does not mutate this instance.
        simple_dict = dict(self.__dict__)
        simple_dict['aggregation_fn'] = self.aggregation_fn.__name__
| return simple_dict |
| |
| def is_result_performance_change(self): |
| """Check if we think there was a performance change.""" |
| if self.get_value_status() in (REGRESSED, IMPROVED): |
| return True |
| return False |
| |
| def is_result_interesting(self): |
| """is_result_interesting() -> bool |
| |
| Check whether the result is worth displaying, either because of a |
| failure, a test status change or a performance change.""" |
| if self.get_test_status() != UNCHANGED_PASS: |
| return True |
| if self.get_value_status() in (REGRESSED, IMPROVED): |
| return True |
| return False |
| |
| def get_test_status(self): |
| # Compute the comparison status for the test success. |
| if self.failed: |
| if self.prev_failed: |
| return UNCHANGED_FAIL |
| else: |
| return REGRESSED |
| else: |
| if self.prev_failed: |
| return IMPROVED |
| else: |
| return UNCHANGED_PASS |
| |
| # FIXME: take into account hash of binary - if available. If the hash is |
| # the same, the binary is the same and therefore the difference cannot be |
| # significant - for execution time. It can be significant for compile time. |
| def get_value_status(self, confidence_interval=2.576, |
| value_precision=MIN_VALUE_PRECISION, |
| ignore_small=True): |
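        """Classify the value change as REGRESSED, IMPROVED, UNCHANGED_PASS
        or UNCHANGED_FAIL, or return None if either the current or previous
        value is missing."""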
| if self.current is None or self.previous is None: |
| return None |
| |
| # Don't report value errors for tests which fail, or which just started |
| # passing. |
| # |
| # FIXME: One bug here is that we risk losing performance data on tests |
| # which flop to failure then back. What would be nice to do here is to |
| # find the last value in a passing run, or to move to using proper |
| # keyed reference runs. |
| if self.failed: |
| return UNCHANGED_FAIL |
| elif self.prev_failed: |
| return UNCHANGED_PASS |
| |
        # Always ignore relative changes below MIN_PERCENTAGE_CHANGE (a
        # fraction, so .01 == 1%); for now we just don't have enough time to
        # investigate changes that small.
| if ignore_small and abs(self.pct_delta) < MIN_PERCENTAGE_CHANGE: |
| return UNCHANGED_PASS |
| |
| # Always ignore changes with small deltas. There is no mathematical |
| # basis for this, it should be obviated by appropriate statistical |
| # checks, but practical evidence indicates what we currently have isn't |
| # good enough (for reasons I do not yet understand). |
| if ignore_small and abs(self.delta) < MIN_PERCENTAGE_CHANGE: |
| return UNCHANGED_PASS |
| |
| # Ignore tests whose delta is too small relative to the precision we |
| # can sample at; otherwise quantization means that we can't measure the |
| # standard deviation with enough accuracy. |
| if abs(self.delta) <= 2 * value_precision * confidence_interval: |
| return UNCHANGED_PASS |
| |
| # Use Mann-Whitney U test to test null hypothesis that result is |
| # unchanged. |
| if len(self.samples) >= 4 and len(self.prev_samples) >= 4: |
| same = stats.mannwhitneyu(self.samples, self.prev_samples, |
| self.confidence_lv) |
| if same: |
| return UNCHANGED_PASS |
| |
        # If we have a comparison window, then measure using a symmetric
        # confidence interval.
| if self.stddev is not None: |
| is_significant = abs(self.delta) > (self.stddev * |
| confidence_interval) |
| |
            # If the delta is significant, return the direction of the change.
| if is_significant: |
| if self.delta < 0: |
| return REGRESSED if self.bigger_is_better else IMPROVED |
| else: |
| return IMPROVED if self.bigger_is_better else REGRESSED |
| else: |
| return UNCHANGED_PASS |
| |
| # Otherwise, report any changes above 0.2%, which is a rough |
| # approximation for the smallest change we expect "could" be measured |
| # accurately. |
| if not ignore_small or abs(self.pct_delta) >= .002: |
| if self.pct_delta < 0: |
| return REGRESSED if self.bigger_is_better else IMPROVED |
| else: |
| return IMPROVED if self.bigger_is_better else REGRESSED |
| else: |
| return UNCHANGED_PASS |
| |
| |
| class RunInfo(object): |
| def __init__(self, session, testsuite, runs_to_load, |
| aggregation_fn=stats.safe_min, confidence_lv=.05, |
| only_tests=None): |
| """Get all the samples needed to build a CR. |
| runs_to_load are the run IDs of the runs to get the samples from. |
| if only_tests is passed, only samples form those test IDs are fetched. |
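
        For example (illustrative only; assumes `session` is an open
        database session, `ts` is a test-suite object, and `run`, `prev_run`,
        `test_id`, `field` and `hash_of_binary_field` are placeholders):

            info = RunInfo(session, ts, runs_to_load=[run.id, prev_run.id])
            cr = info.get_comparison_result([run], [prev_run], test_id,
                                            field, hash_of_binary_field)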
| """ |
| self.testsuite = testsuite |
| self.aggregation_fn = aggregation_fn |
| self.confidence_lv = confidence_lv |
| |
| self.sample_map = multidict.multidict() |
| self.profile_map = dict() |
| self.loaded_run_ids = set() |
| |
| self._load_samples_for_runs(session, runs_to_load, only_tests) |
| |
| @property |
| def test_ids(self): |
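        """The set of test IDs for which at least one sample is loaded."""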
| return set(key[1] for key in self.sample_map.keys()) |
| |
| def get_sliding_runs(self, session, run, compare_run, |
| num_comparison_runs=0): |
| """ |
        Get the num_comparison_runs most recent runs preceding run and, if
        given, compare_run. This query is expensive.
| """ |
| runs = [run] |
| runs_prev = self.testsuite \ |
| .get_previous_runs_on_machine(session, run, num_comparison_runs) |
| runs += runs_prev |
| |
| if compare_run is not None: |
| compare_runs = [compare_run] |
| comp_prev = self.testsuite \ |
| .get_previous_runs_on_machine(session, compare_run, |
| num_comparison_runs) |
| compare_runs += comp_prev |
| else: |
| compare_runs = [] |
| |
| return runs, compare_runs |
| |
| def get_run_comparison_result(self, run, compare_to, test_id, field, |
| hash_of_binary_field): |
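        """Build a ComparisonResult for a single run against an optional
        single baseline run (compare_to may be None)."""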
| if compare_to is not None: |
| compare_to = [compare_to] |
| else: |
| compare_to = [] |
| return self.get_comparison_result([run], compare_to, test_id, field, |
| hash_of_binary_field) |
| |
| def get_samples(self, runs, test_id): |
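        """Return the concatenated sample tuples for test_id across the given
        runs, skipping runs that have no samples for that test."""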
| all_samples = [] |
| for run in runs: |
| samples = self.sample_map.get((run.id, test_id)) |
| if samples is not None: |
| all_samples.extend(samples) |
| return all_samples |
| |
| def get_comparison_result(self, runs, compare_runs, test_id, field, |
| hash_of_binary_field): |
| # Get the field which indicates the requested field's status. |
| status_field = field.status_field |
| |
| # Load the sample data for the current and previous runs and the |
| # comparison window. |
| run_samples = self.get_samples(runs, test_id) |
| prev_samples = self.get_samples(compare_runs, test_id) |
| |
| cur_profile = prev_profile = None |
| if runs: |
| cur_profile = self.profile_map.get((runs[0].id, test_id), None) |
| if compare_runs: |
| prev_profile = self.profile_map.get((compare_runs[0].id, test_id), |
| None) |
| |
| # Determine whether this (test,pset) passed or failed in the current |
| # and previous runs. |
| # |
| # FIXME: Support XFAILs and non-determinism (mixed fail and pass) |
| # better. |
| run_failed = prev_failed = False |
| if status_field: |
| status_field_index = self.testsuite.get_field_index(status_field) |
| for sample in run_samples: |
| run_failed |= sample[status_field_index] == FAIL |
| for sample in prev_samples: |
| prev_failed |= sample[status_field_index] == FAIL |
| |
| field_index = self.testsuite.get_field_index(field) |
| |
| # Get the current and previous values. |
| run_values = [s[field_index] for s in run_samples |
| if s[field_index] is not None] |
| prev_values = [s[field_index] for s in prev_samples |
| if s[field_index] is not None] |
| if hash_of_binary_field: |
| hash_of_binary_field_index = \ |
| self.testsuite.get_field_index(hash_of_binary_field) |
| hash_values = [s[hash_of_binary_field_index] for s in run_samples |
| if s[hash_of_binary_field_index] is not None] |
            prev_hash_values = [s[hash_of_binary_field_index]
                                for s in prev_samples
                                if s[hash_of_binary_field_index] is not None]
| |
            # All hash values for the current run should be the same. Warn in
            # the log when the hash differs between samples of the same run.
| cur_hash_set = set(hash_values) |
| if len(cur_hash_set) > 1: |
| logger.warning("Found different hashes for multiple samples " |
| "in the same run {0}: {1}\nTestID:{2}" |
| .format(runs, hash_values, test_id)) |
| |
| cur_hash = hash_values[0] if len(hash_values) > 0 else None |
| prev_hash = prev_hash_values[0] \ |
| if len(prev_hash_values) > 0 else None |
| else: |
| cur_hash = None |
| prev_hash = None |
| r = ComparisonResult(self.aggregation_fn, |
| run_failed, prev_failed, run_values, |
| prev_values, cur_hash, prev_hash, |
| cur_profile, prev_profile, |
| self.confidence_lv, |
| bigger_is_better=field.bigger_is_better) |
| return r |
| |
| def get_geomean_comparison_result(self, run, compare_to, field, tests): |
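        """Build a ComparisonResult over the geometric means of the tests
        whose status is UNCHANGED_PASS, summarizing the whole run."""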
| unchanged_tests = [(cr.previous, cr.current, cr.prev_hash, cr.cur_hash) |
| for _, _, cr in tests |
| if cr.get_test_status() == UNCHANGED_PASS] |
| if unchanged_tests: |
| prev_values, run_values, prev_hash, cur_hash = zip( |
| *unchanged_tests) |
| prev_values = [x for x in prev_values if x is not None] |
| run_values = [x for x in run_values if x is not None] |
| prev_hash = [x for x in prev_hash if x is not None] |
| cur_hash = [x for x in cur_hash if x is not None] |
| prev_hash = prev_hash[0] if len(prev_hash) > 0 else None |
| cur_hash = cur_hash[0] if len(cur_hash) > 0 else None |
| prev_geo = calc_geomean(prev_values) |
| prev_values = [prev_geo] if prev_geo else [] |
| run_values = [calc_geomean(run_values)] |
| else: |
| prev_values, run_values, prev_hash, cur_hash = [], [], None, None |
| |
| return ComparisonResult(self.aggregation_fn, |
| cur_failed=not bool(run_values), |
| prev_failed=not bool(prev_values), |
| samples=run_values, |
| prev_samples=prev_values, |
| cur_hash=cur_hash, |
| prev_hash=prev_hash, |
| confidence_lv=0, |
| bigger_is_better=field.bigger_is_better) |
| |
| def _load_samples_for_runs(self, session, run_ids, only_tests): |
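        """Batch-load the sample data for run_ids into sample_map and
        profile_map, skipping any runs that have already been loaded."""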
| # Find the set of new runs to load. |
| to_load = set(run_ids) - self.loaded_run_ids |
| if not to_load: |
| return |
| |
| # Batch load all of the samples for the needed runs. |
| # |
| # We speed things up considerably by loading the column data directly |
| # here instead of requiring SA to materialize Sample objects. |
| columns = [self.testsuite.Sample.run_id, |
| self.testsuite.Sample.test_id, |
| self.testsuite.Sample.profile_id] |
| columns.extend(f.column for f in self.testsuite.sample_fields) |
| q = session.query(*columns) |
| if only_tests: |
| q = q.filter(self.testsuite.Sample.test_id.in_(only_tests)) |
| q = q.filter(self.testsuite.Sample.run_id.in_(to_load)) |
| for data in q: |
| run_id = data[0] |
| test_id = data[1] |
| profile_id = data[2] |
| sample_values = data[3:] |
| self.sample_map[(run_id, test_id)] = sample_values |
| if profile_id is not None: |
| self.profile_map[(run_id, test_id)] = profile_id |
| |
| self.loaded_run_ids |= to_load |