Skip to content

Latest commit

 

History

History
executable file
·
234 lines (200 loc) · 13.5 KB

File metadata and controls

executable file
·
234 lines (200 loc) · 13.5 KB

Forecasting

UANL

#+NAME: forecasting_uanl

import matplotlib.pyplot as plt
import statsmodels.api as sm
import numbers
import pandas as pd
from tabulate import tabulate
from statsmodels.stats.outliers_influence import summary_table
from typing import Tuple, Dict
import numpy as np


def print_tabulate(df: pd.DataFrame):
    """Print ``df`` to stdout as an org-mode table.

    The frame's column labels are used as the table headers.
    """
    rendered = tabulate(df, headers=df.columns, tablefmt="orgtbl")
    print(rendered)

def transform_variable(df: pd.DataFrame, x:str)->pd.Series:
    """Return column ``x`` of ``df`` in a form usable as a numeric regressor.

    The first element of the column, taken by position, decides the result:

    * if it is a ``numbers.Number`` the column is returned unchanged;
    * otherwise the values are ignored and a new Series
      ``0, 1, ..., len(df[x]) - 1`` with a default index is returned.

    An empty column also yields the (empty) positional Series.

    :param df: frame holding the column.
    :param x: name of the column to transform.
    :return: the original column or its positional replacement.
    """
    column = df[x]
    # .iloc reads the first row by position, so the check does not depend on
    # the index containing the label 0 (it does not after tail()/filtering).
    if not column.empty and isinstance(column.iloc[0], numbers.Number):
        return column
    return pd.Series(range(len(column)))

def linear_regression(df: pd.DataFrame, x:str, y: str)->Dict[str, float]:
    """Fit ``y ~ const + x`` by ordinary least squares and summarise the fit.

    The regressor is ``transform_variable(df, x)``, so a non-numeric ``x``
    column is replaced by the row position. The coefficient table of the
    statsmodels summary is printed as an org-mode table.

    :param df: frame holding both columns.
    :param x: name of the explanatory column.
    :param y: name of the response column.
    :return: dict with the keys
        ``m`` (slope), ``b`` (intercept), ``r2``, ``r2_adj``, and
        ``low_band`` / ``hi_band``, the lower and upper confidence bounds of
        the intercept (the ``[0.025`` / ``0.975]`` columns of the summary).
    """
    fixed_x = transform_variable(df, x)
    # Pass the response as a bare array: the positional regressor carries a
    # fresh default index, which need not match df's own index.
    model = sm.OLS(df[y].to_numpy(), sm.add_constant(fixed_x)).fit()
    # The HTML round trip is kept only to print the same table as before;
    # it is rendered and parsed once.
    coef_html = model.summary().tables[1].as_html()
    print_tabulate(pd.read_html(coef_html, header=0, index_col=0)[0])
    # Read the numbers from the fitted model instead of the rendered table,
    # which only carries display-rounded values.
    # NOTE(review): relies on add_constant placing the constant first, as the
    # original positional lookups did.
    params = np.asarray(model.params, dtype=float)
    conf = np.asarray(model.conf_int(), dtype=float)
    return {
        'm': float(params[1]),
        'b': float(params[0]),
        'r2': float(model.rsquared),
        'r2_adj': float(model.rsquared_adj),
        'low_band': float(conf[0, 0]),
        'hi_band': float(conf[0, 1]),
    }

def plt_lr(df: pd.DataFrame, x:str, y: str, m: float, b: float, r2: float, r2_adj: float, low_band: float, hi_band: float, colors: Tuple[str,str]):
    """Scatter ``y`` against ``x`` and overlay a fitted line with a band.

    The line is ``m * x + b``; the band is the area between the same slope
    drawn with ``low_band`` and with ``hi_band`` as intercept. ``r2`` and
    ``r2_adj`` are accepted so the dict returned by ``linear_regression``
    can be splatted into the call; they are not used here.

    :param colors: ``(line colour, band colour)``.
    """
    regressor = transform_variable(df, x)
    df.plot(x=x, y=y, kind='scatter')
    # Slope contribution shared by the line and both band edges.
    slope_term = m * regressor.to_numpy()
    plt.plot(df[x], slope_term + b, color=colors[0])
    plt.fill_between(
        df[x],
        slope_term + low_band,
        slope_term + hi_band,
        alpha=0.2,
        color=colors[1],
    )





df = pd.read_csv("csv/typed_uanl.csv") # type: pd.DataFrame
#print_tabulate(df.head(50))
# Mean of "Sueldo Neto" for every distinct "Fecha". The "mean" alias is used
# instead of the unbound pd.DataFrame.mean, which would be called with a
# Series group as its ``self``.
df_by_sal = df.groupby("Fecha")\
              .aggregate(sueldo_mensual=pd.NamedAgg(column="Sueldo Neto", aggfunc="mean"))\
              .reset_index()
# df_by_sal["sueldo_mensual"] = df_by_sal["sueldo_mensual"]**10
#print_tabulate(df_by_sal.head(5))
# Fit the per-date mean against "Fecha" and draw scatter, line and band.
a = linear_regression(df_by_sal, "Fecha", "sueldo_mensual")
plt_lr(df=df_by_sal, x="Fecha", y="sueldo_mensual", colors=('red', 'orange'), **a)

plt.xticks(rotation=90)
plt.savefig('img/lr_sueldo_mensual_Fecha_m.png')
plt.close()
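|       | coef    | std err | t      | P>t | [0.025  | 0.975]  |
|-------+---------+---------+--------+-----+---------+---------|
| const | 10800   | 555.67  | 19.444 | 0   | 9652.11 | 12000   |
| 0     | 175.686 | 41.398  | 4.244  | 0   | 89.832  | 261.541 |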

img/lr_sueldo_mensual_Fecha_m.png

forecasting

#+NAME: forecasting

import matplotlib.pyplot as plt
import statsmodels.api as sm
import numbers
import pandas as pd
from tabulate import tabulate
from statsmodels.stats.outliers_influence import summary_table
from typing import Tuple, Dict
import numpy as np


def print_tabulate(df: pd.DataFrame):
    """Write ``df`` to stdout formatted as an org-mode table.

    Headers are taken from the frame's column labels.
    """
    org_table = tabulate(df, headers=df.columns, tablefmt="orgtbl")
    print(org_table)

def transform_variable(df: pd.DataFrame, x:str)->pd.Series:
    """Return column ``x`` of ``df`` in a form usable as a numeric regressor.

    The first element of the column, taken by position, decides the result:

    * if it is a ``numbers.Number`` the column is returned unchanged;
    * otherwise the values are ignored and a new Series
      ``0, 1, ..., len(df[x]) - 1`` with a default index is returned.

    An empty column also yields the (empty) positional Series.

    :param df: frame holding the column.
    :param x: name of the column to transform.
    :return: the original column or its positional replacement.
    """
    column = df[x]
    # Positional access: a lookup by the first index label returns a Series
    # (not a scalar) when that label is duplicated, which would misclassify
    # a numeric column as non-numeric.
    if not column.empty and isinstance(column.iloc[0], numbers.Number):
        return column
    return pd.Series(range(len(column)))

def linear_regression(df: pd.DataFrame, x:str, y: str)->Dict[str, float]:
    """Fit ``y ~ const + x`` by ordinary least squares and summarise the fit.

    The regressor is ``transform_variable(df, x)``, so a non-numeric ``x``
    column is replaced by the row position.

    :param df: frame holding both columns.
    :param x: name of the explanatory column.
    :param y: name of the response column.
    :return: dict with the keys
        ``m`` (slope), ``b`` (intercept), ``r2``, ``r2_adj``, and
        ``low_band`` / ``hi_band``, the lower and upper confidence bounds of
        the intercept (the ``[0.025`` / ``0.975]`` columns of the summary).
    """
    fixed_x = transform_variable(df, x)
    # The response is passed without its index: the positional regressor has
    # a fresh default index that need not match df's own.
    model = sm.OLS(df[y].to_numpy(), sm.add_constant(fixed_x)).fit()
    # Take the numbers from the fitted model rather than rendering the
    # summary to HTML and parsing it back, which only yields display-rounded
    # values.
    # NOTE(review): relies on add_constant placing the constant first, as the
    # original positional lookups did.
    params = np.asarray(model.params, dtype=float)
    conf = np.asarray(model.conf_int(), dtype=float)
    return {
        'm': float(params[1]),
        'b': float(params[0]),
        'r2': float(model.rsquared),
        'r2_adj': float(model.rsquared_adj),
        'low_band': float(conf[0, 0]),
        'hi_band': float(conf[0, 1]),
    }

def plt_lr(df: pd.DataFrame, x:str, y: str, m: float, b: float, r2: float, r2_adj: float, low_band: float, hi_band: float, colors: Tuple[str,str]):
    """Overlay a fitted line and its band on the current matplotlib axes.

    The line is ``m * x + b``; the band is the area between the same slope
    drawn with ``low_band`` and with ``hi_band`` as intercept. No scatter is
    drawn here. ``y``, ``r2`` and ``r2_adj`` are accepted so the dict
    returned by ``linear_regression`` can be splatted into the call; they
    are not used.

    :param colors: ``(line colour, band colour)``.
    """
    # Slope contribution shared by the line and both band edges.
    slope_term = m * transform_variable(df, x).to_numpy()
    plt.plot(df[x], slope_term + b, color=colors[0])
    plt.fill_between(
        df[x],
        slope_term + low_band,
        slope_term + hi_band,
        alpha=0.2,
        color=colors[1],
    )


def normalize_distribution(dist: np.ndarray, n: int)->np.ndarray:
    """Rescale ``dist`` into non-negative, rounded values summing to about ``n``.

    The values are shifted so the minimum sits just above zero, scaled so
    they add up to ``n``, and rounded to the nearest integer value. Because
    of the rounding the result's sum can differ slightly from ``n``.

    :param dist: numeric samples; any array-like is accepted.
    :param n: target total of the rescaled values.
    :return: float array of rounded values, same shape as ``dist``.
    :raises ValueError: if ``dist`` is empty.
    """
    values = np.asarray(dist, dtype=float)
    # The small offset keeps the total strictly positive, so the division
    # below is defined even when every input value is identical.
    shifted = values - values.min() + 0.000001
    return np.round(shifted / shifted.sum() * n)

# Build a synthetic daily sales series and store it as csv/sales.csv:
# one standard-normal draw per day, rescaled by normalize_distribution to a
# target total of 100 per day.
begin_date, end_date = '2016-01-01', '2023-01-01'
date_range = pd.date_range(start=begin_date, end=end_date, freq='1D')
norm_dist = np.random.standard_normal(len(date_range))
sales = normalize_distribution(norm_dist, 100*len(date_range))
df = pd.DataFrame(
    {
        'Fecha': date_range,
        'ventas': sales,
    }
) # type: pd.DataFrame
df.to_csv('csv/sales.csv', index=False)

# pandas numbers weekdays from Monday = 0, so Thursday ("jueves") is 3.
JUEVES = 3

full_df = pd.read_csv('csv/sales.csv')
print(f"full -> mean: {np.mean(full_df['ventas'])}, sd: {np.std(full_df['ventas'])}")
df = full_df.tail(50)
x = "Fecha"
y= "ventas"

# Scatter of the rows dated after 2022-12-01 (ISO date strings compare
# chronologically).
full_df[full_df["Fecha"]>'2022-12-01'].plot(x=x,y=y, kind='scatter')
plt.xticks(rotation=90)
plt.savefig('img/full_ventas_Fecha_m.png')
plt.close()

# One figure: scatter of the last 50 rows with three fits drawn on top --
# all 50 rows, the last 5 rows, and the Thursdays among the 50.
df.plot(x=x,y=y, kind='scatter')
a = linear_regression(df, x,y)
plt_lr(df=df, x=x, y=y, colors=('red', 'orange'), **a)
a = linear_regression(df.tail(5), x,y)
plt_lr(df=df.tail(5), x=x, y=y, colors=('red', 'orange'), **a)
df_j = df[pd.to_datetime(df[x]).dt.dayofweek == JUEVES]
print_tabulate(df_j)
a = linear_regression(df_j, x,y)
plt_lr(df=df_j, x=x, y=y, colors=('blue', 'blue'), **a)
#
plt.xticks(rotation=90)
plt.savefig('img/lr_ventas_Fecha_m.png')
plt.close()

# Second window: 2019-11-11 up to (not including) 2020-01-02.
fechas_full = pd.to_datetime(full_df[x])
df2 = full_df.loc[(fechas_full >= '2019-11-11') & (fechas_full < '2020-01-02')]
dfs = [
    ('50D', df),
    ('10D', df.tail(10)),
    ('5D', df.tail(5)),
    ('jueves', df_j),
    ('50D-1Y', df2),
    ('10D-Y', df2.tail(10)),
    ('5D-Y', df2.tail(5)),
    ('jueves-Y', df2[pd.to_datetime(df2[x]).dt.dayofweek == JUEVES]),
]
lrs = [(title, linear_regression(_df,x=x,y=y), len(_df)) for title, _df in dfs]
# Evaluate each fitted line at x = len(subset). The date column is text, so
# the regressor is the row position 0..len-1 and len is one step past the
# last observed row.
lrs_p = [(title, lr_dict["m"]*size  + lr_dict["b"], lr_dict) for title, lr_dict, size in lrs]
print(lrs_p)
full -> mean: 99.99413604378421, sd: 29.088195349566618 /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ”
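|      | Fecha      | ventas |
|------+------------+--------|
| 2510 | 2022-11-15 | 98     |
| 2517 | 2022-11-22 | 100    |
| 2524 | 2022-11-29 | 95     |
| 2531 | 2022-12-06 | 130    |
| 2538 | 2022-12-13 | 129    |
| 2545 | 2022-12-20 | 115    |
| 2552 | 2022-12-27 | 74     |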

/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=10 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=10 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=10 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. 
warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=10 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=10 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=10 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. 
warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. warn(“omni_normtest is not valid with less than 8 observations; %i ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=8 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=8 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” /usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 … continuing anyway, n=8 warnings.warn(“kurtosistest only valid for n>=20 … continuing ” [(‘50D’, 96.7985, {‘m’: -0.2557, ‘b’: 109.5835, ‘r2’: 0.016, ‘r2_adj’: -0.004, ‘low_band’: 93.258, ‘hi_band’: 125.909}), (‘10D’, 105.7336, {‘m’: 1.897, ‘b’: 86.7636, ‘r2’: 0.023, ‘r2_adj’: -0.099, ‘low_band’: 33.207, ‘hi_band’: 140.32}), (‘5D’, 128.0, {‘m’: 7.4, ‘b’: 91.0, ‘r2’: 0.154, ‘r2_adj’: -0.128, ‘low_band’: 12.875, ‘hi_band’: 169.125}), (‘jueves’, 104.7144, {‘m’: -0.2857, ‘b’: 106.7143, ‘r2’: 0.001, ‘r2_adj’: -0.199, ‘low_band’: 68.112, ‘hi_band’: 145.317}), (‘50D-1Y’, 104.0542, {‘m’: 0.0391, ‘b’: 102.021, ‘r2’: 0.001, ‘r2_adj’: -0.019, ‘low_band’: 87.626, ‘hi_band’: 116.416}), (‘10D-Y’, 97.6, {‘m’: -1.0, ‘b’: 107.6, ‘r2’: 0.006, ‘r2_adj’: -0.118, ‘low_band’: 50.622, ‘hi_band’: 164.578}), (‘5D-Y’, 96.3, {‘m’: 0.3, ‘b’: 94.8, ‘r2’: 0.0, ‘r2_adj’: -0.333, ‘low_band’: 4.514, ‘hi_band’: 185.086}), (‘jueves-Y’, 109.2859, {‘m’: 1.4524, ‘b’: 97.6667, ‘r2’: 0.019, ‘r2_adj’: -0.145, ‘low_band’: 53.559, ‘hi_band’: 141.774})]

img/full_ventas_Fecha_m.png

img/lr_ventas_Fecha_m.png img/full_ventas_Fecha_m.png