#+Name forecasting_uanl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numbers
import pandas as pd
from tabulate import tabulate
from statsmodels.stats.outliers_influence import summary_table
from typing import Tuple, Dict
import numpy as np
def print_tabulate(df: pd.DataFrame):
    """Print ``df`` to stdout as an org-mode table, using its column names as headers."""
    rendered = tabulate(df, headers=df.columns, tablefmt="orgtbl")
    print(rendered)
def transform_variable(df: pd.DataFrame, x: str) -> pd.Series:
    """Return a numeric regressor for column ``x`` of ``df``.

    If the first value of the column is a number, the column itself is
    returned. Otherwise (for example date strings) the rows are replaced by
    their positions ``0, 1, ..., len(df) - 1`` in a new Series that carries a
    default index.

    The first value is read by position rather than through the label ``0``,
    so frames whose index does not contain ``0`` (such as ``df.tail(n)``) no
    longer raise ``KeyError``. An empty column yields an empty positional
    Series.

    Args:
        df: Frame holding the column.
        x: Name of the column to use as the independent variable.

    Returns:
        The original column when it is numeric, else a 0-based position Series.
    """
    column = df[x]
    if not column.empty and isinstance(column.iloc[0], numbers.Number):
        return column
    return pd.Series(range(len(column)))
def linear_regression(df: pd.DataFrame, x: str, y: str) -> Dict[str, float]:
    """Fit ``y = b + m * x`` by ordinary least squares and print the coefficient table.

    The regressor is obtained from ``transform_variable(df, x)``, so a
    non-numeric ``x`` column is replaced by row positions.

    The returned numbers are read from the fitted model itself instead of
    being parsed back out of the rendered summary: the summary text is rounded
    for display (a large intercept is shown with only three significant
    digits), which shifted the fitted line. The summary table is still
    rendered once, only to print it.

    Args:
        df: Data to fit.
        x: Name of the independent-variable column.
        y: Name of the dependent-variable column.

    Returns:
        A dict with the keys ``m`` (slope), ``b`` (intercept), ``r2``,
        ``r2_adj``, and ``low_band`` / ``hi_band`` (lower and upper bound of
        the intercept's confidence interval, the ``[0.025`` and ``0.975]``
        columns of the summary).

    Raises:
        IndexError: If the fit has no slope term (``add_constant`` did not
            add a separate constant column).
    """
    # Local import so this block does not depend on edits to the file header.
    from io import StringIO

    # Align both operands by position: statsmodels rejects pandas inputs whose
    # indexes differ, which happens when transform_variable() returns a fresh
    # 0..n-1 Series for a frame that carries any other index.
    regressor = transform_variable(df, x).reset_index(drop=True)
    response = df[y].reset_index(drop=True)
    model = sm.OLS(response, sm.add_constant(regressor)).fit()

    # Display only; wrapped in StringIO because read_html expects a file-like
    # object or path rather than a literal HTML string in recent pandas.
    coef_html = model.summary().tables[1].as_html()
    print_tabulate(pd.read_html(StringIO(coef_html), header=0, index_col=0)[0])

    # Row 0 is the constant (add_constant prepends it), row 1 is the slope.
    params = np.asarray(model.params, dtype=float)
    intercept_low, intercept_high = np.asarray(model.conf_int(), dtype=float)[0]
    return {
        'm': float(params[1]),
        'b': float(params[0]),
        'r2': float(model.rsquared),
        'r2_adj': float(model.rsquared_adj),
        'low_band': float(intercept_low),
        'hi_band': float(intercept_high),
    }
def plt_lr(df: pd.DataFrame, x: str, y: str, m: float, b: float, r2: float, r2_adj: float, low_band: float, hi_band: float, colors: Tuple[str, str]):
    """Scatter-plot ``y`` against ``x`` and overlay a fitted line with a shaded band.

    The line is ``m * x + b``; the band is filled between ``m * x + low_band``
    and ``m * x + hi_band``. ``colors[0]`` paints the line and ``colors[1]``
    the band. ``r2`` and ``r2_adj`` are accepted but not used, which lets the
    dict returned by ``linear_regression`` be unpacked straight into the call.
    """
    positions = transform_variable(df, x)

    def shifted_line(intercept):
        # One y value per row: the slope applied to the regressor plus an offset.
        return [m * value + intercept for value in positions]

    df.plot(x=x, y=y, kind='scatter')
    plt.plot(df[x], shifted_line(b), color=colors[0])
    plt.fill_between(
        df[x],
        shifted_line(low_band),
        shifted_line(hi_band),
        alpha=0.2,
        color=colors[1],
    )
# Regress the mean net salary ("Sueldo Neto") per date ("Fecha") and save the plot.
df = pd.read_csv("csv/typed_uanl.csv")  # type: pd.DataFrame
# print_tabulate(df.head(50))

# "mean" selects the group-by mean directly. The previous
# aggfunc=pd.DataFrame.mean handed an unbound DataFrame method the per-group
# Series, which depends on pandas internals to work at all.
df_by_sal = (
    df.groupby("Fecha")
    .aggregate(sueldo_mensual=pd.NamedAgg(column="Sueldo Neto", aggfunc="mean"))
    .reset_index()
)
# df_by_sal["sueldo_mensual"] = df_by_sal["sueldo_mensual"]**10
# print_tabulate(df_by_sal.head(5))

a = linear_regression(df_by_sal, "Fecha", "sueldo_mensual")
plt_lr(df=df_by_sal, x="Fecha", y="sueldo_mensual", colors=('red', 'orange'), **a)
plt.xticks(rotation=90)
plt.savefig('img/lr_sueldo_mensual_Fecha_m.png')
plt.close()
|       | coef    | std err | t      | P>\|t\| | [0.025 | 0.975]  |
|-------|---------|---------|--------|---------|--------|---------|
| const | 10800   | 555.67  | 19.444 | 0       | 9652.1 | 12000   |
| 0     | 175.686 | 41.398  | 4.244  | 0       | 89.832 | 261.541 |
#+Name forecasting
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numbers
import pandas as pd
from tabulate import tabulate
from statsmodels.stats.outliers_influence import summary_table
from typing import Tuple, Dict
import numpy as np
def print_tabulate(df: pd.DataFrame):
    """Write ``df`` to stdout formatted as an org-mode table with its columns as headers."""
    table_text = tabulate(df, headers=df.columns, tablefmt="orgtbl")
    print(table_text)
def transform_variable(df: pd.DataFrame, x: str) -> pd.Series:
    """Return a numeric regressor for column ``x`` of ``df``.

    If the first value of the column is a number, the column itself is
    returned. Otherwise (for example date strings) the rows are replaced by
    their positions ``0, 1, ..., len(df) - 1`` in a new Series that carries a
    default index.

    The first value is read by position. Looking it up through the first index
    label returns a Series instead of a scalar when that label is duplicated,
    which made a numeric column look non-numeric. An empty column yields an
    empty positional Series instead of raising.

    Args:
        df: Frame holding the column.
        x: Name of the column to use as the independent variable.

    Returns:
        The original column when it is numeric, else a 0-based position Series.
    """
    column = df[x]
    if not column.empty and isinstance(column.iloc[0], numbers.Number):
        return column
    return pd.Series(range(len(column)))
def linear_regression(df: pd.DataFrame, x: str, y: str) -> Dict[str, float]:
    """Fit ``y = b + m * x`` by ordinary least squares and summarise the fit.

    The regressor is obtained from ``transform_variable(df, x)``, so a
    non-numeric ``x`` column is replaced by row positions.

    The returned numbers are read from the fitted model itself instead of
    being parsed back out of the rendered HTML summary three times: the
    summary text is rounded for display, so the parsed slope and intercept
    (and every prediction computed from them) lost precision.

    Args:
        df: Data to fit.
        x: Name of the independent-variable column.
        y: Name of the dependent-variable column.

    Returns:
        A dict with the keys ``m`` (slope), ``b`` (intercept), ``r2``,
        ``r2_adj``, and ``low_band`` / ``hi_band`` (lower and upper bound of
        the intercept's confidence interval, the ``[0.025`` and ``0.975]``
        columns of the summary).

    Raises:
        IndexError: If the fit has no slope term (``add_constant`` did not
            add a separate constant column).
    """
    regressor = transform_variable(df, x)
    # The response is passed as a plain list, so it carries no index and is
    # matched with the regressor purely by position.
    model = sm.OLS(list(df[y]), sm.add_constant(regressor)).fit()

    # Row 0 is the constant (add_constant prepends it), row 1 is the slope.
    # np.asarray covers both the pandas and the ndarray result containers.
    params = np.asarray(model.params, dtype=float)
    intercept_low, intercept_high = np.asarray(model.conf_int(), dtype=float)[0]
    return {
        'm': float(params[1]),
        'b': float(params[0]),
        'r2': float(model.rsquared),
        'r2_adj': float(model.rsquared_adj),
        'low_band': float(intercept_low),
        'hi_band': float(intercept_high),
    }
def plt_lr(df: pd.DataFrame, x: str, y: str, m: float, b: float, r2: float, r2_adj: float, low_band: float, hi_band: float, colors: Tuple[str, str]):
    """Overlay a fitted line and a shaded band on the current matplotlib axes.

    The line is ``m * x + b``; the band is filled between ``m * x + low_band``
    and ``m * x + hi_band``. ``colors[0]`` paints the line and ``colors[1]``
    the band. Unlike a full plot helper, no scatter is drawn here. ``r2`` and
    ``r2_adj`` are accepted but not used, which lets the dict returned by
    ``linear_regression`` be unpacked straight into the call.
    """
    positions = transform_variable(df, x)

    def shifted_line(intercept):
        # One y value per row: the slope applied to the regressor plus an offset.
        return [m * value + intercept for value in positions]

    plt.plot(df[x], shifted_line(b), color=colors[0])
    plt.fill_between(
        df[x],
        shifted_line(low_band),
        shifted_line(hi_band),
        alpha=0.2,
        color=colors[1],
    )
def normalize_distribution(dist: np.ndarray, n: int, epsilon: float = 0.000001) -> np.ndarray:
    """Rescale ``dist`` to non-negative rounded values that add up to about ``n``.

    The values are shifted so the minimum becomes ``epsilon``, scaled so they
    sum to ``n``, and rounded to whole numbers. Because each element is
    rounded independently, the result's total can differ slightly from ``n``.

    Args:
        dist: Input values; any array-like is accepted.
        n: Target total for the rescaled values.
        epsilon: Offset added after shifting. It keeps every shifted value
            strictly positive, so the sum is non-zero even when all inputs
            are equal.

    Returns:
        Array of the same shape as ``dist`` holding the rounded values.

    Raises:
        ValueError: If ``dist`` is empty.
    """
    values = np.asarray(dist)
    # np.min reduces in native code and over every axis; the builtin min
    # iterated element by element and failed on lists and n-D arrays.
    shifted = values - np.min(values) + epsilon
    scaled = (shifted / np.sum(shifted)) * n
    return np.round(scaled)
# --- Synthetic daily sales: generate, persist, reload ------------------------
begin_date = '2016-01-01'
end_date = '2023-01-01'
date_range = pd.date_range(start=begin_date, end=end_date, freq='1D')
norm_dist = np.random.standard_normal(len(date_range))
# Target total is 100 units per generated day.
sales = normalize_distribution(norm_dist, 100 * len(date_range))
df = pd.DataFrame({'Fecha': date_range, 'ventas': sales})  # type: pd.DataFrame
df.to_csv('csv/sales.csv', index=False)

# The file is read back without date parsing; the code below converts 'Fecha'
# with pd.to_datetime wherever it needs real dates.
full_df = pd.read_csv('csv/sales.csv')
print(f"full -> mean: {np.mean(full_df['ventas'])}, sd: {np.std(full_df['ventas'])}")
df = full_df.tail(50)
x = "Fecha"
y = "ventas"

# --- Figure 1: raw scatter of the rows dated after 2022-12-01 ----------------
recent_rows = full_df[full_df["Fecha"] > '2022-12-01']
recent_rows.plot(x=x, y=y, kind='scatter')
plt.xticks(rotation=90)
plt.savefig('img/full_ventas_Fecha_m.png')
plt.close()

# --- Figure 2: last 50 rows with three regression overlays -------------------
df.plot(x=x, y=y, kind='scatter')
for window in (df, df.tail(5)):
    a = linear_regression(window, x, y)
    plt_lr(df=window, x=x, y=y, colors=('red', 'orange'), **a)

# NOTE(review): dayofweek == 1 selects Tuesdays (pandas counts Monday as 0),
# while the variable name and the 'jueves' labels below say Thursday.
# Confirm which weekday is intended.
df_j = df[pd.to_datetime(df[x]).dt.dayofweek == 1]
print_tabulate(df_j)
a = linear_regression(df_j, x, y)
plt_lr(df=df_j, x=x, y=y, colors=('blue', 'blue'), **a)
plt.xticks(rotation=90)
plt.savefig('img/lr_ventas_Fecha_m.png')
plt.close()

# --- Compare fits over several windows ---------------------------------------
fechas = pd.to_datetime(full_df[x])
df2 = full_df.loc[(fechas >= '2019-11-11') & (fechas < '2020-01-02')]
dfs = [
    ('50D', df),
    ('10D', df.tail(10)),
    ('5D', df.tail(5)),
    ('jueves', df[pd.to_datetime(df[x]).dt.dayofweek == 1]),
    ('50D-1Y', df2),
    ('10D-Y', df2.tail(10)),
    ('5D-Y', df2.tail(5)),
    ('jueves-Y', df2[pd.to_datetime(df2[x]).dt.dayofweek == 1]),
]
lrs = [
    (label, linear_regression(window_df, x=x, y=y), len(window_df))
    for label, window_df in dfs
]
# Evaluate each fitted line at x = number of rows in its window.
lrs_p = [
    (label, fit["m"] * n_rows + fit["b"], fit)
    for label, fit, n_rows in lrs
]
print(lrs_p)
| | Fecha | ventas |
|---|---|---|
| 2510 | 2022-11-15 | 98 |
| 2517 | 2022-11-22 | 100 |
| 2524 | 2022-11-29 | 95 |
| 2531 | 2022-12-06 | 130 |
| 2538 | 2022-12-13 | 129 |
| 2545 | 2022-12-20 | 115 |
| 2552 | 2022-12-27 | 74 |
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
/usr/local/lib64/python3.9/site-packages/scipy/stats/_stats_py.py:1769: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
[('50D', 96.7985, {'m': -0.2557, 'b': 109.5835, 'r2': 0.016, 'r2_adj': -0.004, 'low_band': 93.258, 'hi_band': 125.909}), ('10D', 105.7336, {'m': 1.897, 'b': 86.7636, 'r2': 0.023, 'r2_adj': -0.099, 'low_band': 33.207, 'hi_band': 140.32}), ('5D', 128.0, {'m': 7.4, 'b': 91.0, 'r2': 0.154, 'r2_adj': -0.128, 'low_band': 12.875, 'hi_band': 169.125}), ('jueves', 104.7144, {'m': -0.2857, 'b': 106.7143, 'r2': 0.001, 'r2_adj': -0.199, 'low_band': 68.112, 'hi_band': 145.317}), ('50D-1Y', 104.0542, {'m': 0.0391, 'b': 102.021, 'r2': 0.001, 'r2_adj': -0.019, 'low_band': 87.626, 'hi_band': 116.416}), ('10D-Y', 97.6, {'m': -1.0, 'b': 107.6, 'r2': 0.006, 'r2_adj': -0.118, 'low_band': 50.622, 'hi_band': 164.578}), ('5D-Y', 96.3, {'m': 0.3, 'b': 94.8, 'r2': 0.0, 'r2_adj': -0.333, 'low_band': 4.514, 'hi_band': 185.086}), ('jueves-Y', 109.2859, {'m': 1.4524, 'b': 97.6667, 'r2': 0.019, 'r2_adj': -0.145, 'low_band': 53.559, 'hi_band': 141.774})]


