Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions fpstats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import itertools
from typing import Dict


def accumulate(monthly_data: Dict[str, int]) -> Dict[str, int]:
    '''Build a running total of the monthly figures.

    Returns a dict with the same keys, in the same order, whose values
    are the cumulative sums of ``monthly_data``'s values.
    '''
    running_totals = itertools.accumulate(monthly_data.values())
    return {month: total for month, total in zip(monthly_data, running_totals)}
124 changes: 124 additions & 0 deletions fpstats/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/python3
from argparse import (
ArgumentParser,
Namespace,
RawDescriptionHelpFormatter,
)
from datetime import datetime
from typing import (
Iterable,
List,
)

import matplotlib.pyplot as plt
from matplotlib.dates import (
DateFormatter,
MonthLocator,
YearLocator,
)

from . import (
from_pypi,
from_bigquery,
accumulate,
)


# Help text rendered by argparse.  EPILOG embeds the BigQuery console URL
# and the SQL query so --help doubles as a how-to for generating the JSON
# input file (RawDescriptionHelpFormatter preserves this layout verbatim).
DESCRIPTION = 'Acquire pypi stats for the fixedpoint package.'
EPILOG = f'''\
pypistats.org only keeps 180 days of history. Visit the following site to
obtain more history:

{from_bigquery.BIGQUERY_URL}

You can use the following SQL query to obtain the data:

{from_bigquery.SQL_QUERY}

Once the query returns data, save it in json format to a file and provide that
filename as the JSON argument.

If this filename is not present, then pypistats.org data is used and will only
date back 180 days.
'''


def parse_args(clargs: List = None) -> Namespace:
    '''Parse the command line.

    Args:
        clargs: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Parsed argument namespace with a single optional ``json`` entry.
    '''
    parser = ArgumentParser(
        description=DESCRIPTION,
        epilog=EPILOG,
        formatter_class=RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        'json',
        nargs='?',
        default=None,
        metavar='JSON',
        help=('json file to parse; run --help to see '
              'instructions on how to generate it'),
    )
    return parser.parse_args(clargs)


def plot(dates: Iterable[str],
         monthly: Iterable[int],
         cum: Iterable[int] = None) -> None:
    '''Generate a plot of monthly (and optionally cumulative) downloads.

    Args:
        dates: iterable of 'YYYY-MM' date strings.
        monthly: downloads per month, aligned with dates.
        cum: optional cumulative downloads, aligned with dates; when
            given, a second subplot is drawn beneath the first.
    '''
    dates_ = [datetime.strptime(d, '%Y-%m') for d in dates]

    fig = plt.figure()
    # The two subplots were previously duplicated inline; the shared
    # axis/label setup now lives in _draw_series.
    _draw_series(fig, 211, dates_, monthly, 'Monthly fixpoint Downloads')
    if cum is not None:
        _draw_series(fig, 212, dates_, cum, 'Cumulative fixpoint Downloads')

    plt.show()


def _draw_series(fig, subplot, dates_, values, title):
    '''Draw one dated time series onto a new subplot of *fig*.'''
    ax = fig.add_subplot(subplot)
    ax.plot_date(dates_, values, '.-')
    # With more than a year of data, label major ticks by year and add
    # minor ticks per month; otherwise label every month directly.
    if len(dates_) > 12:
        ax.xaxis.set_major_locator(YearLocator())
        ax.xaxis.set_minor_locator(MonthLocator())
        ax.xaxis.set_major_formatter(DateFormatter('%Y'))
    else:
        ax.xaxis.set_major_locator(MonthLocator())
        ax.xaxis.set_major_formatter(DateFormatter('%Y-%m'))

    ax.fmt_xdata = DateFormatter('%Y-%m')
    fig.autofmt_xdate()
    ax.set_title(title)
    plt.xlabel('Month')
    plt.ylabel('# Downloads')
    plt.grid(True)


def main(clargs: List = None) -> None:
    '''Entry point: parse arguments, gather stats, and plot them.

    Args:
        clargs: argument list to parse; ``None`` means ``sys.argv[1:]``.
    '''
    # BUG FIX: forward clargs to parse_args.  Previously parse_args()
    # was called with no arguments, so main(['file.json']) silently
    # parsed sys.argv instead of the list it was given.
    arg = parse_args(clargs)

    # from_bigquery.stats_monthly raises TypeError when arg.json is None
    # (open(None)); fall back to the pypistats source (last 180 days).
    try:
        stats = from_bigquery.stats_monthly(arg.json)
    except TypeError:
        stats = from_pypi.stats_monthly(arg.json)

    cum = accumulate(stats)

    plot(stats.keys(),
         stats.values(),
         cum.values())


# Script entry point (``python -m fpstats``).
if __name__ == '__main__':
    main()
61 changes: 61 additions & 0 deletions fpstats/from_bigquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
'''Fixedpoint package downloads statistics from BigQuery.

pypistats (https://pypistats.org/) only keeps statistics from the last
180 days, so to get all-time download history, Google Cloud's BigQuery
pypi downloads tables must be used.

To do this, navigate to
https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads
and use query shown in the SQL_QUERY parameter in this module.

Export/copy the data as a json file and use this script to point to it.
'''
import json
from pathlib import Path
from typing import (
Any,
Dict,
Union,
)

# BigQuery console URL for the public pypi downloads table (kept on one
# long line; noqa suppresses the line-length lint warning).
BIGQUERY_URL = 'https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads' # noqa
# Standard-SQL query that counts fixedpoint downloads per month, ordered
# ascending so the resulting dict preserves chronological order.
SQL_QUERY = '''\
#standardSQL
SELECT
COUNT(*) AS num_downloads,
DATE_TRUNC(DATE(timestamp), MONTH) AS `month`
FROM `bigquery-public-data.pypi.file_downloads`
WHERE
file.project = 'fixedpoint'
AND DATE(timestamp)
BETWEEN '2020-04-01' -- fixedpoint 1.0.0 published on this date
AND CURRENT_DATE()
GROUP BY `month`
ORDER BY `month` ASC'''


def stats_monthly(filename: Union[str, Path],
                  *args: Any,
                  **kwargs: Any) -> Dict[str, int]:
    '''Parse monthly pypi download counts out of a BigQuery result file.

    Args:
        filename: path to json-formatted file of BigQuery results.
            Contents must be ordered (per SQL_QUERY) and shaped like:
            [{"num_downloads": "42", "month": "2020-04-01"}, ...]

    Returns:
        Ordered dict of monthly downloads, keyed by yyyy-mm date string
        and valued by the number of downloads that month.
    '''
    with open(filename) as stream:
        records = json.load(stream)

    monthly: Dict[str, int] = {}
    for record in records:
        # Strip the trailing '-01' day component to leave 'YYYY-MM'.
        monthly[record["month"][:-3]] = int(record["num_downloads"])
    return monthly
71 changes: 71 additions & 0 deletions fpstats/from_pypi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
'''Fixedpoint package downloads statistics from pypi.

This uses pypistats (https://pypistats.org/) to gather statistics on
the last 180 days.
'''
from datetime import date, datetime, timedelta
import json
from typing import (
Any,
List,
Dict,
)
import warnings

import pypistats as pps
from prodict import Prodict


class CategoryDownloads(Prodict):
    '''Typed view over one entry of the pypistats response ``data`` list.'''

    # Number of downloads reported for this entry.
    downloads: int


class Stats(Prodict):
    '''Typed view over the top-level pypistats JSON response.'''

    # Per-category download entries returned by pypistats.
    data: List[CategoryDownloads]


# Suppress the 180-day-history UserWarning issued by pypistats; we
# knowingly query month-by-month back to 2020.
warnings.simplefilter("ignore", UserWarning)


def stats_monthly(*args: Any, **kwargs: Any) -> Dict[str, int]:
    '''Get monthly pypi download stats.

    Returns:
        Ordered dict of monthly downloads, keyed by yyyy-mm date string
        and valued by the number of downloads that month.
    '''
    TODAY = date.today()
    ONE_DAY = timedelta(days=1)
    ret = {}
    query = Prodict(format='json', color='no')
    for year in range(2020, TODAY.year + 1):
        for month in range(1, 13):

            # The end of this month is one day before the next month
            # starts.
            startdt = datetime(year, month, 1)
            # BUG FIX: December previously wrapped to January of the
            # SAME year ((month + 1) % 12 -> 0), which made the end
            # date Dec 31 of the PREVIOUS year and inverted the range.
            if month == 12:
                nextmonthdt = datetime(year + 1, 1, 1)
            else:
                nextmonthdt = datetime(year, month + 1, 1)
            enddt = nextmonthdt - ONE_DAY
            query.start_date = startdt.strftime('%Y-%m-%d')
            query.end_date = enddt.strftime('%Y-%m-%d')

            # Get data; pypistats raises ValueError for ranges with no
            # data, in which case the month is simply skipped.
            try:
                response = pps.overall('fixedpoint', **query)
            except ValueError:
                continue

            # Format data
            raw = Stats.from_dict(json.loads(response))

            # Parse data.  NOTE(review): takes the larger of the first
            # two entries — presumably the with/without-mirrors
            # categories; confirm against pypistats output.
            key = f'{year}-{month:02d}'
            ret[key] = max(raw.data[0].downloads,
                           raw.data[1].downloads)

    return ret