#!/usr/bin/env python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Recovered from the patch "Create ab_analysis.py" (commit
# ccfd652bf5342ee93edfdca59f25f2e0255d9e43, author shivanigithub,
# Wed, 2 Mar 2016 15:57:23 -0500), which adds this file as
# utils/analysis/ab_analysis.py.

"""
Does A/B analysis of two result sets (control and experiment).
For each URL the script outputs the change in metrics such as SpeedIndex,
first paint, or parser time, and whether the change is statistically
significant.

The inputs are the Page Data files (at the bottom of bulk test result).

Command line arguments:
-c <control Page Data CSV file>
-e <experiment Page Data CSV file>
-m <metric to analyse>

If -m is not present, then SpeedIndex is the default metric.

The script runs on both Python 2.7 and Python 3.
"""
from collections import Counter
from collections import defaultdict
import csv
import getopt
import math
import sys

import numpy
from scipy import stats

# The standard WPT metrics we are interested in. Note that we provide some
# additional custom metrics, such as 'ParseTime'.
_METRICS = [
    'SpeedIndex',
    'render',  # time to first paint
]

# Default metric to perform analysis on, if no metric is specified on the
# command line.
_DEFAULT_METRIC = 'SpeedIndex'

# Minimum number of samples we require in order to perform analysis.
_MIN_SAMPLES = 4

# pmeenan says that only results with codes of 0 or 99999 are valid.
_VALID_RESULT_CODES = (0, 99999)

# Two-sided confidence level used for every confidence interval.
_CONFIDENCE = 0.95

# One output row per URL; see writeOutput() for the meaning of each field.
_ROW_FORMAT = (
    u'{: >6.0f}ms {: >6.0f}ms {: >5.1f}% {} {: <60} '
    u'{: >6.0f}ms \u00b1{:2.0f}% {: >6.0f}ms \u00b1{:2.0f}% {} {} {} {}')


def populateCsvData(filename):
    """Reads a CSV file and returns its rows as a list of dictionaries.

    Args:
        filename: path of a CSV file whose first line is the header.

    Returns:
        A list with one dict per data row, keyed by the header names.
    """
    if sys.version_info[0] >= 3:
        # The Python 3 csv module needs a text-mode file opened with
        # newline=''. NOTE(review): assumes the export is UTF-8 -- confirm.
        handle = open(filename, 'r', newline='', encoding='utf-8')
    else:
        # The Python 2 csv module needs a binary-mode file.
        handle = open(filename, 'rb')
    with handle as f:
        return list(csv.DictReader(f))


def groupByUrl(data):
    """Groups the metric samples of all valid runs of the same URL together.

    Rows with an empty 'URL' or 'result' column, or with a result code other
    than 0 or 99999, are ignored. Metric values that are empty, missing or
    not strictly positive are ignored as well.

    Args:
        data: iterable of row dicts, as returned by populateCsvData().

    Returns:
        A defaultdict mapping URL -> defaultdict mapping metric name -> list
        of integer samples. Besides the metrics in _METRICS, a custom
        'ParseTime' metric (domInteractive - domLoading) is generated.
    """
    grouped = defaultdict(lambda: defaultdict(list))
    for row in data:
        # 'URL' and 'result' are looked up strictly so that feeding the wrong
        # kind of file fails loudly instead of silently producing no output.
        url = row['URL']
        if not url:
            continue
        result_code = row['result']
        if not result_code:
            continue
        if int(result_code) not in _VALID_RESULT_CODES:
            continue

        group = grouped[url]
        for metric in _METRICS:
            # Metric columns are optional: a missing column is treated like
            # an empty cell rather than aborting the whole analysis.
            val = row.get(metric)
            if not val:
                continue
            val = int(val)
            if val <= 0:
                continue
            group[metric].append(val)

        # Generate a custom 'ParseTime' metric which looks at the delta
        # between domInteractive and domLoading.
        parse_start = row.get('chromeUserTiming.domLoading')
        parse_end = row.get('chromeUserTiming.domInteractive')
        if not parse_start or not parse_end:
            continue
        parse_start = int(parse_start)
        parse_end = int(parse_end)
        if parse_start <= 0 or parse_end <= 0:
            continue
        group['ParseTime'].append(parse_end - parse_start)
    return grouped


def mergeControlAndExperiment(control_data, experiment_data):
    """Combines per-URL control and experiment data into one mapping.

    Args:
        control_data: mapping of URL -> data for the control run.
        experiment_data: mapping of URL -> data for the experiment run.

    Returns:
        A defaultdict mapping URL -> dict that holds the control data under
        'control' and the experiment data under 'experiment'. A URL present
        in only one input gets only the corresponding key.
    """
    merged = defaultdict(dict)
    for url, url_data in control_data.items():
        merged[url]['control'] = url_data
    for url, url_data in experiment_data.items():
        merged[url]['experiment'] = url_data
    return merged


def discardOutliers(control_samples, experiment_samples):
    """Trims the extreme samples when that tightens both sample sets.

    Four candidates are evaluated on the sorted samples: all samples, without
    the smallest, without the largest, and without both. A candidate replaces
    the current best only if it scores strictly higher for the control AND
    for the experiment samples, so both sets are always trimmed the same way.

    Args:
        control_samples: list of numeric samples of the control run.
        experiment_samples: list of numeric samples of the experiment run.

    Returns:
        A (control, experiment) tuple of new sorted lists. The input lists
        are left untouched.
    """
    def computeScore(candidate_samples, full_samples):
        # Simple scoring function. Candidates with lower confidence intervals
        # get higher scores. We also discount the score based on the ratio of
        # sizes between the candidate and the full set of samples, to prefer
        # larger sample sets in cases where confidence intervals between two
        # candidates are similar.
        _, candidate_ci = computeMeanAndConfidenceInterval(candidate_samples)
        if candidate_ci == 0:
            return sys.float_info.max
        partial_samples_ratio = (
            float(len(candidate_samples)) / float(len(full_samples)))
        return partial_samples_ratio / float(candidate_ci)

    # Work on sorted copies so the caller's lists are not reordered.
    control_sorted = sorted(control_samples)
    experiment_sorted = sorted(experiment_samples)
    if not control_sorted or not experiment_sorted:
        # Nothing to trim, and scoring would divide by a zero sample count.
        return (control_sorted, experiment_sorted)

    # Fall back to the untrimmed samples should no candidate ever qualify
    # (e.g. a score that is not a number), instead of returning None.
    best_control = control_sorted
    best_exp = experiment_sorted
    best_control_score = 0
    best_exp_score = 0
    trims = (slice(None), slice(1, None), slice(None, -1), slice(1, -1))
    for trim in trims:
        control_candidate = control_sorted[trim]
        exp_candidate = experiment_sorted[trim]
        control_candidate_score = computeScore(
            control_candidate, control_sorted)
        exp_candidate_score = computeScore(exp_candidate, experiment_sorted)
        if (control_candidate_score > best_control_score and
                exp_candidate_score > best_exp_score):
            best_control_score = control_candidate_score
            best_control = control_candidate
            best_exp_score = exp_candidate_score
            best_exp = exp_candidate
    return (best_control, best_exp)


def computeMeanAndConfidenceInterval(samples):
    """Returns a (mean, half confidence interval) tuple for the samples.

    The interval is the 95% interval of a normal distribution centred on the
    mean whose scale is the standard error of the mean, i.e.
    z * std(samples) / sqrt(len(samples)).

    Args:
        samples: sequence of numeric samples.

    Returns:
        A tuple (mean, half_width), where mean +/- half_width is the
        confidence interval. half_width is 0 when all samples are identical.
    """
    mean = numpy.mean(samples)
    standard_error = numpy.std(samples) / math.sqrt(len(samples))
    # The half width is computed as z * standard_error rather than through
    # stats.norm.interval(), because the latter yields NaN for a zero scale
    # (all samples identical) and a NaN interval breaks every comparison made
    # on it further down.
    z_score = stats.norm.ppf(0.5 + _CONFIDENCE / 2.0)
    return (mean, z_score * standard_error)


def _toText(value):
    """Returns value as text, decoding byte strings (Python 2 str) as UTF-8."""
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def _emitLine(text):
    """Prints one line of text to stdout on both Python 2 and Python 3."""
    if sys.version_info[0] < 3:
        # Python 2 may have an ASCII stdout; emit UTF-8 bytes explicitly.
        text = text.encode('utf-8')
    print(text)


def _discardedSamples(original_samples, kept_samples):
    """Returns the sorted samples of original_samples missing from kept."""
    # A multiset difference, so that a discarded value which also occurs
    # among the kept samples is still reported.
    return sorted(
        (Counter(original_samples) - Counter(kept_samples)).elements())


def writeOutput(merged_data, metric):
    """Prints one line of A/B statistics for every URL with enough samples.

    URLs that lack control or experiment data, or that have fewer than
    _MIN_SAMPLES samples of the metric on either side, are skipped.

    Each line contains, in order: the delta of the means, the delta less the
    combined confidence intervals, that delta as a percentage, a '+' marker
    when the change is significant, the (truncated) URL, control mean and
    relative confidence interval, experiment mean and relative confidence
    interval, the kept control and experiment samples, and the control and
    experiment samples discarded as outliers.

    Args:
        merged_data: mapping as returned by mergeControlAndExperiment().
        metric: name of the metric to analyse, e.g. 'SpeedIndex'.
    """
    for url, url_data in merged_data.items():
        if 'control' not in url_data or 'experiment' not in url_data:
            continue
        orig_control_samples = url_data['control'].get(metric)
        orig_experiment_samples = url_data['experiment'].get(metric)
        if (not orig_control_samples or not orig_experiment_samples or
                len(orig_control_samples) < _MIN_SAMPLES or
                len(orig_experiment_samples) < _MIN_SAMPLES):
            continue

        control_samples, experiment_samples = discardOutliers(
            orig_control_samples, orig_experiment_samples)
        control_mean, control_ci = computeMeanAndConfidenceInterval(
            control_samples)
        exp_mean, exp_ci = computeMeanAndConfidenceInterval(
            experiment_samples)

        # The delta in means, less the combined confidence intervals. This
        # tends to be a better metric to understand impact as it discounts
        # the impact of large confidence intervals on the mean.
        mean_delta = control_mean - exp_mean
        combined_ci = control_ci + exp_ci
        if mean_delta > 0:
            mean_delta_less_ci = mean_delta - combined_ci
        else:
            mean_delta_less_ci = mean_delta + combined_ci
        percent_improvement = (
            100.0 * mean_delta_less_ci / max(control_mean, exp_mean))
        is_significant = abs(mean_delta) > combined_ci

        url_text = _toText(url)
        if len(url_text) > 60:
            url_text = url_text[:57] + u'...'

        _emitLine(_ROW_FORMAT.format(
            mean_delta,
            mean_delta_less_ci if is_significant else 0,
            percent_improvement if is_significant else 0,
            u'+' if is_significant else u' ',
            url_text,
            control_mean,
            100.0 * control_ci / control_mean,
            exp_mean,
            100.0 * exp_ci / exp_mean,
            control_samples,
            experiment_samples,
            _discardedSamples(orig_control_samples, control_samples),
            _discardedSamples(orig_experiment_samples, experiment_samples)))


def main(argv):
    """Parses the command line and prints the A/B analysis.

    Args:
        argv: full argument vector, including the program name at index 0.

    Exits with status 1 on an invalid command line or when -c or -e is
    missing.
    """
    try:
        opts, _ = getopt.getopt(argv[1:], "c:e:m:")
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)

    metric = _DEFAULT_METRIC
    control_filename = None
    experiment_filename = None
    for k, v in opts:
        if k == '-c':
            control_filename = v
        elif k == '-e':
            experiment_filename = v
        elif k == '-m':
            metric = v
        else:
            print('Unexpected arg %s' % k)
            sys.exit(1)

    if not control_filename or not experiment_filename:
        print('Must specify -c and -e.')
        sys.exit(1)

    control_data = groupByUrl(populateCsvData(control_filename))
    experiment_data = groupByUrl(populateCsvData(experiment_filename))
    merged_data = mergeControlAndExperiment(control_data, experiment_data)
    writeOutput(merged_data, metric)


if __name__ == "__main__":
    main(sys.argv)