From 3f1d43dc7323bb2b19ebb636ba10cb00255b1f52 Mon Sep 17 00:00:00 2001
From: Zack Sheffield
Date: Sat, 30 Dec 2023 18:37:46 -0700
Subject: [PATCH] Adding fpstats module

---
 fpstats/__init__.py      |  15 +++++
 fpstats/__main__.py      | 141 +++++++++++++++++++++++++++++++++++++++
 fpstats/from_bigquery.py |  64 ++++++++++++++++++
 fpstats/from_pypi.py     |  78 ++++++++++++++++++++++
 4 files changed, 298 insertions(+)
 create mode 100644 fpstats/__init__.py
 create mode 100644 fpstats/__main__.py
 create mode 100644 fpstats/from_bigquery.py
 create mode 100644 fpstats/from_pypi.py

diff --git a/fpstats/__init__.py b/fpstats/__init__.py
new file mode 100644
index 0000000..dc1240c
--- /dev/null
+++ b/fpstats/__init__.py
@@ -0,0 +1,15 @@
+import itertools
+from typing import Dict
+
+
+def accumulate(monthly_data: Dict[str, int]) -> Dict[str, int]:
+    '''Take the given monthly data and generate cumulative data.
+
+    Args:
+        monthly_data: per-month values; insertion order is preserved.
+
+    Returns:
+        Dict with the same keys, each valued by the running total so far.
+    '''
+    acums = itertools.accumulate(monthly_data.values())
+    return dict(zip(monthly_data.keys(), acums))
diff --git a/fpstats/__main__.py b/fpstats/__main__.py
new file mode 100644
index 0000000..fb54742
--- /dev/null
+++ b/fpstats/__main__.py
@@ -0,0 +1,141 @@
+#!/usr/bin/python3
+from argparse import (
+    ArgumentParser,
+    Namespace,
+    RawDescriptionHelpFormatter,
+)
+from datetime import datetime
+from typing import (
+    Iterable,
+    List,
+    Optional,
+)
+
+import matplotlib.pyplot as plt
+from matplotlib.dates import (
+    DateFormatter,
+    MonthLocator,
+    YearLocator,
+)
+
+from . import (
+    from_pypi,
+    from_bigquery,
+    accumulate,
+)
+
+
+DESCRIPTION = 'Acquire pypi stats for the fixedpoint package.'
+EPILOG = f'''\
+pypistats.org only keeps 180 days of history. Visit the following site to
+obtain more history:
+
+{from_bigquery.BIGQUERY_URL}
+
+You can use the following SQL query to obtain the data:
+
+{from_bigquery.SQL_QUERY}
+
+Once the query returns data, save it in json format to a file and provide that
+filename as the JSON argument.
+
+If this filename is not present, then pypistats.org data is used and will only
+date back 180 days.
+'''
+
+
+def parse_args(clargs: Optional[List[str]] = None) -> Namespace:
+    '''Parse command line arguments.
+
+    Args:
+        clargs: argument strings to parse; sys.argv[1:] is used when None.
+
+    Returns:
+        Namespace whose json attribute is the given file name, or None.
+    '''
+    parser = ArgumentParser(
+        formatter_class=RawDescriptionHelpFormatter,
+        description=DESCRIPTION,
+        epilog=EPILOG,
+    )
+    parser.add_argument('json',
+                        metavar='JSON',
+                        help=('json file to parse; run --help to see '
+                              'instructions on how to generate it'),
+                        nargs='?',
+                        default=None)
+    return parser.parse_args(clargs)
+
+
+def _plot_series(ax,
+                 dates: List[datetime],
+                 values: Iterable[int],
+                 title: str) -> None:
+    '''Draw one download series on ax and set up its date ticks.'''
+    # Materialize first: callers pass dict views, which are not sequences
+    ax.plot(dates, list(values), '.-')
+    if len(dates) > 12:
+        # More than a year of data: label the years, tick the months
+        ax.xaxis.set_major_locator(YearLocator())
+        ax.xaxis.set_minor_locator(MonthLocator())
+        ax.xaxis.set_major_formatter(DateFormatter('%Y'))
+    else:
+        ax.xaxis.set_major_locator(MonthLocator())
+        ax.xaxis.set_major_formatter(DateFormatter('%Y-%m'))
+
+    ax.fmt_xdata = DateFormatter('%Y-%m')
+    ax.set_title(title)
+    ax.set_xlabel('Month')
+    ax.set_ylabel('# Downloads')
+    ax.grid(True)
+
+
+def plot(dates: Iterable[str],
+         monthly: Iterable[int],
+         cum: Optional[Iterable[int]] = None) -> None:
+    '''Generate a plot.
+
+    Args:
+        dates: month labels in yyyy-mm format.
+        monthly: downloads per month, aligned with dates.
+        cum: cumulative downloads, aligned with dates; the second subplot is
+            omitted when None.
+    '''
+    dates_ = [datetime.strptime(d, '%Y-%m') for d in dates]
+
+    fig = plt.figure()
+    _plot_series(fig.add_subplot(211), dates_, monthly,
+                 'Monthly fixedpoint Downloads')
+    if cum is not None:
+        _plot_series(fig.add_subplot(212), dates_, cum,
+                     'Cumulative fixedpoint Downloads')
+    # Rotate the date labels once all subplots exist
+    fig.autofmt_xdate()
+
+    plt.show()
+
+
+def main(clargs: Optional[List[str]] = None) -> None:
+    '''Fetch monthly download stats and plot them.
+
+    Args:
+        clargs: argument strings to parse; sys.argv[1:] is used when None.
+    '''
+    arg = parse_args(clargs)
+
+    # Explicit check rather than catching the TypeError from open(None):
+    # that handler also hid TypeErrors raised by malformed JSON entries.
+    if arg.json is None:
+        stats = from_pypi.stats_monthly()
+    else:
+        stats = from_bigquery.stats_monthly(arg.json)
+
+    cum = accumulate(stats)
+
+    plot(stats.keys(),
+         stats.values(),
+         cum.values())
+
+
+if __name__ == '__main__':
+    main()
diff --git a/fpstats/from_bigquery.py b/fpstats/from_bigquery.py
new file mode 100644
index 0000000..b2bd681
--- /dev/null
+++ b/fpstats/from_bigquery.py
@@ -0,0 +1,64 @@
+'''Fixedpoint package downloads statistics from BigQuery.
+
+pypistats (https://pypistats.org/) only keeps statistics from the last
+180 days, so to get all-time download history, Google Cloud's BigQuery
+pypi downloads tables must be used.
+
+To do this, navigate to
+https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads
+and use the query shown in the SQL_QUERY parameter in this module.
+
+Export/copy the data as a json file and use this script to point to it.
+'''
+import json
+from pathlib import Path
+from typing import (
+    Any,
+    Dict,
+    Union,
+)
+
+BIGQUERY_URL = 'https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads'  # noqa
+SQL_QUERY = '''\
+#standardSQL
+SELECT
+  COUNT(*) AS num_downloads,
+  DATE_TRUNC(DATE(timestamp), MONTH) AS `month`
+FROM `bigquery-public-data.pypi.file_downloads`
+WHERE
+  file.project = 'fixedpoint'
+  AND DATE(timestamp)
+    BETWEEN '2020-04-01' -- fixedpoint 1.0.0 published on this date
+    AND CURRENT_DATE()
+GROUP BY `month`
+ORDER BY `month` ASC'''
+
+
+def stats_monthly(filename: Union[str, Path],
+                  *args: Any,
+                  **kwargs: Any) -> Dict[str, int]:
+    '''Get monthly pypi download stats from BigQuery data.
+
+    Args:
+        filename: path to json-formatted file of BigQuery results.
+        *args, **kwargs: accepted but unused.
+
+    Returns:
+        Ordered dict of monthly downloads, keyed by yyyy-mm date string
+        and valued by the number of downloads that month.
+
+    File contents must be ordered (per SQL_QUERY) and in the following format:
+    [
+        {
+            "num_downloads": "42",
+            "month": "2020-04-01"
+        },
+        ...
+    ]
+    Whitespace around the month string is ignored.
+    '''
+    with open(filename, encoding='utf-8') as fp:
+        data = json.load(fp)
+    # Keep only the leading yyyy-mm of each (stripped) yyyy-mm-dd date
+    return {entry["month"].strip()[:7]: int(entry["num_downloads"])
+            for entry in data}
diff --git a/fpstats/from_pypi.py b/fpstats/from_pypi.py
new file mode 100644
index 0000000..68c7b44
--- /dev/null
+++ b/fpstats/from_pypi.py
@@ -0,0 +1,78 @@
+'''Fixedpoint package downloads statistics from pypi.
+
+This uses pypistats (https://pypistats.org/) to gather statistics on
+the last 180 days.
+'''
+from datetime import date, datetime, timedelta
+import json
+from typing import (
+    Any,
+    List,
+    Dict,
+)
+import warnings
+
+import pypistats as pps
+from prodict import Prodict
+
+
+class CategoryDownloads(Prodict):
+    downloads: int
+
+
+class Stats(Prodict):
+    data: List[CategoryDownloads]
+
+
+# Suppress 180-day warning issued by pypistats
+warnings.simplefilter("ignore", UserWarning)
+
+
+def stats_monthly(*args: Any, **kwargs: Any) -> Dict[str, int]:
+    '''Get monthly pypi download stats.
+
+    Args:
+        *args, **kwargs: accepted but unused.
+
+    Returns:
+        Ordered dict of monthly downloads, keyed by yyyy-mm date string
+        and valued by the number of downloads that month.
+    '''
+    TODAY = date.today()
+    ONE_DAY = timedelta(days=1)
+    ret = {}
+    query = Prodict(format='json', color='no')
+    for year in range(2020, TODAY.year + 1):
+        for month in range(1, 13):
+
+            # Determine start and end dates
+            startdt = datetime(year, month, 1)
+            if startdt.date() > TODAY:
+                # Months that have not started yet cannot have downloads
+                break
+            # The month after December is January of the *next* year;
+            # staying in the same year put the end date before the start.
+            if month == 12:
+                nextmonthdt = datetime(year + 1, 1, 1)
+            else:
+                nextmonthdt = datetime(year, month + 1, 1)
+            enddt = nextmonthdt - ONE_DAY
+            query.start_date = startdt.strftime('%Y-%m-%d')
+            query.end_date = enddt.strftime('%Y-%m-%d')
+
+            # Get data
+            try:
+                response = pps.overall('fixedpoint', **query)
+            except ValueError:
+                # pypistats raised for this range (presumably no data); skip
+                continue
+
+            # Format data
+            raw = Stats.from_dict(json.loads(response))
+
+            # Parse Data
+            key = f'{year}-{month:02d}'
+            ret[key] = max(raw.data[0].downloads,
+                           raw.data[1].downloads)
+
+    return ret