Estadistica descriptiva a los datos por medio de funciones de agregacion, es importate saber un poco de algebra relacional.
- Funciones de agregación
- min
- max
- moda
- conteo
- sumatoria
- media
- variaza
- desviacion estándar
- asimetria
- kurtosis
- Algebra relacional
- transposicion
- selección
- proyección
- uniones
- joins
- agrupación
import pandas as pd
from tabulate import tabulate
from typing import Tuple, List
import matplotlib
matplotlib.use('TKAgg')
def print_tabulate(df: pd.DataFrame):
print(tabulate(df, headers=df.columns, tablefmt='orgtbl'))<<imports>>
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
def categorize(name:str)->str:
if 'PREPARATORIA' in name or 'PREPA.' in name:
return 'PREPARATORIA'
if 'FACULTAD' in name or 'FAC.' in name:
return 'FACULTAD'
if 'HOSPITAL' in name:
return 'HOSPITAL'
if 'CENTRO' in name or 'CTRO.' in name or 'C.' in name or 'INVESTIGAC' in name :
return 'CENTRO'
if 'SECRETARÍA' in name or 'SECRETARIA' in name or 'SRIA.' in name or 'DIRECCIÓN' in name or 'DIRECCION' in name or \
'DEPARTAMENTO' in name or 'DEPTO.' in name or 'CONTRALORIA' in name or 'AUDITORIA' in name or 'TESORERIA' in name \
or 'ESCOLAR' in name or 'ABOGACÍA' in name or 'JUNTA' in name or 'RECTORIA' in name or 'IMAGEN' in name :
return 'ADMIN'
return 'OTRO'
def transform_into_typed_df(raw_df: pd.DataFrame)->pd.DataFrame:
raw_df["Fecha"] = pd.to_datetime(raw_df["anio"].map(str)+ "-" + raw_df["mes"].map(str), format="%Y-%m")
raw_df = raw_df.drop(['anio', 'mes'], axis=1)
raw_df["Tipo"] = raw_df["dependencia"].map(categorize)
return raw_df
def analysis_dependencia(df_complete: pd.DataFrame)-> pd.DataFrame:
df_complete["Fecha"] = pd.to_datetime(df_complete["Fecha"], format="%Y-%m-%d")
df_complete["anio"] = df_complete["Fecha"].dt.year
df_by_dep = df_complete.groupby(["Tipo", "anio"]).agg({'Sueldo Neto': ['sum', 'count', 'mean', 'min', 'max']})
df_by_dep = df_by_dep.reset_index()
df_by_dep.columns = ['Tipo', 'anio', 'Suma_Total_sueldos', 'Conteo_Empleados', 'Promedio_sueldo', 'Salario_Minimo', 'Salario_Maximo']
# print_tabulate(df_by_dep.head())
#df_by_dep = df_complete.groupby(["dependencia", "Fecha"]).agg({'Sueldo Neto': ['sum', 'count', 'mean', 'min', 'max']})
return df_by_dep
def create_boxplot_by_type(file_name:str, column: str, aggregate_functions=['sum']):
df_complete = pd.read_csv(file_name)
df_by_type = df_complete.groupby([column,"Fecha"]).agg({'Sueldo Neto': aggregate_functions})# .count()
df_by_type = df_by_type.reset_index()
df_by_type.columns = [column, 'Fecha'] + [f'sueldo_neto_{aggregate_function}' for aggregate_function in aggregate_functions]
df_by_type.boxplot(by = column, figsize=(27,18))
plt.xticks(rotation=90)
plt.savefig(f"img/boxplot_{column}.png")
plt.close()
def plot_by_dep(df: pd.DataFrame, dep:str)->None:
df[df["dependencia"] == dep].plot(y =["Sueldo Neto"])
plt.xticks(rotation=90)
plt.savefig(f"img/lt_{dep}.png")
# df[df["dependencia"] == dep].boxplot(by ='dependencia')
# plt.savefig(f"img/bplt_{dep}.png")
def create_plot_por_dependencia(file_name:str):
df_complete = pd.read_csv(file_name)
df_by_dep = df_complete.groupby(["dependencia", "Fecha"])[["Sueldo Neto"]].agg({'Sueldo Neto': ['sum']})
df_by_dep.reset_index(inplace=True)
df_by_dep.set_index("Fecha", inplace=True)
for dep in set(df_by_dep["dependencia"]):
plot_by_dep(df_by_dep, dep)
df_aux = df_complete.groupby(["Fecha","Tipo"])[['Sueldo Neto']].sum().unstack()
df_aux.plot(y = 'Sueldo Neto', legend=False, figsize=(32,18))
plt.xticks(rotation=90)
plt.savefig("img/foo.png")
plt.close()
def anova(df_aux: pd.DataFrame, str_ols: str):
# shaphiro-wills
# Levenes or barletts
modl = ols(str_ols, data=df_aux).fit()
anova_df = sm.stats.anova_lm(modl, typ=2)
if anova_df["PR(>F)"][0] < 0.005:
print("hay diferencias")
print(anova_df)
# Prueba tukey
# imprimir los resultados
else:
print("No hay diferencias")
def anova_1(file_name: str):
df_complete = pd.read_csv(file_name)
df_by_type = df_complete.groupby(["Tipo", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
df_by_type.reset_index(inplace=True)
# df_by_type.set_index("Fecha", inplace=True)
# df_by_type.reset_index(inplace=True)
df_aux = df_by_type.rename(columns={"Sueldo Neto": "GastoSalarios"}).drop(['Fecha'], axis=1)
df_aux = df_aux.loc[df_aux["Tipo"].isin(["CENTRO","OTRO"])]
# .isin(["ADMIN","CENTRO","OTRO","HOSPITAL","PREPARATORIA"])]
print(df_aux.head())
anova(df_aux, "GastoSalarios ~ Tipo")
def analysis(file_name:str)->None:
df_complete = pd.read_csv(file_name)
# print_tabulate(df_complete[["dependencia","Tipo"]].drop_duplicates().head(150))
df_by_dep = df_complete.groupby(["dependencia", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
df_by_type = df_complete.groupby(["Tipo", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)# .count()
# df_by_dep_by_anio = df_by_dep.groupby(["dependencia","anio"]).aggregate(pd.DataFrame.sum).sort_values(by=["dependencia", "anio"], ascending=True)
df_by_dep.reset_index(inplace=True)
df_by_dep.set_index("Fecha", inplace=True)
# print_tabulate(df_by_dep.head(5))
# for dep in set(df_by_dep["dependencia"]):
# plot_by_dep(df_by_dep, dep)
# df_aux = df_complete.groupby(["Fecha","dependencia"])[['Sueldo Neto']].mean().unstack()
# df_aux.plot(y = 'Sueldo Neto', legend=False, figsize=(32,18))
# plt.xticks(rotation=90)
# plt.savefig("img/foo.png")
# plt.close()
df_by_type.boxplot(by = 'Tipo', figsize=(18,9))
plt.xticks(rotation=90)
plt.savefig("img/boxplot_tipo.png")
plt.close()
# aux = df_complete.groupby(["Tipo"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
# aux.reset_index(inplace=True)
df_by_type.reset_index(inplace=True)
df_aux = df_by_type.rename(columns={"Sueldo Neto": "GastoSalarios"}).drop(['Fecha'], axis=1)
print(df_aux.head())
# shaphiro-wills
# Levenes or barletts
modl = ols("GastoSalarios ~ Tipo", data=df_aux).fit()
anova_df = sm.stats.anova_lm(modl, typ=2)
if anova_df["PR(>F)"][0] < 0.005:
print("hay diferencias")
print(anova_df)
# Prueba tukey
# imprimir los resultados
else:
print("No hay diferencias")
# df_by_dep.boxplot(by ='dependencia', figsize=(32,18))
# plt.xticks(rotation=90)
# plt.savefig("img/boxplot.png")# , bbox_inches='tight')
# plt.close()
def create_typed_df(filename:str)-> pd.DataFrame:
df_complete = pd.read_csv(filename)
raw_df = transform_into_typed_df(df_complete)
return raw_df
def show_type_of_department():
df_complete = pd.read_csv("csv/typed_uanl.csv")
print_tabulate(df_complete[["dependencia","Tipo"]].\
drop_duplicates().head(150))
def show_data_by_dependency_and_date():
df_complete = pd.read_csv("csv/typed_uanl.csv")
df_by_dep = df_complete.groupby(["dependencia", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
df_by_dep.reset_index(inplace=True)
df_by_dep.set_index("Fecha", inplace=True)
print_tabulate(df_by_dep[df_by_dep["dependencia"]== "FAC. DE CIENCIAS FISICO-MATEMATICAS"].head(50))
def show_data_by_type_and_date():
df_complete = pd.read_csv("csv/typed_uanl.csv")
df_by_type = df_complete.groupby(["Tipo", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
df_by_type.reset_index(inplace=True)
df_by_type.set_index("Fecha", inplace=True)
print_tabulate(df_by_type.head(150))
def show_salary_and_count_by_type_and_date():
df_complete = pd.read_csv("csv/typed_uanl.csv")
df_by_type = df_complete.groupby(["Tipo", "Fecha"]).agg({'Sueldo Neto': ['sum', 'count', 'mean', 'min']})
df_by_type.reset_index(inplace=True)
df_by_type.columns = ['Tipo', 'Fecha', 'Total_sueldos', 'Conteo_Empleado', 'Promedio_sueldo', 'Salario_Maximo']
df_by_type.set_index("Fecha", inplace=True)
print_tabulate(df_by_type.head(150))
def show_salary_and_count_by_dependency_and_date():
df_complete = pd.read_csv("csv/typed_uanl.csv")
df_by_type = df_complete.groupby(["dependencia", "Fecha"]).agg({'Sueldo Neto': ['sum', 'count', 'mean', 'max']})
df_by_type.reset_index(inplace=True)
df_by_type.columns = ['Tipo', 'Fecha', 'Total_sueldos', 'Conteo_Empleado', 'Promedio_sueldo', 'Salario_Maximo']
df_by_type.set_index("Fecha", inplace=True)
print_tabulate(df_by_type)
if __name__ == "__main__":
# print_tabulate(typed_df.head(50))
# typed_df = create_typed_df("csv/uanl.csv")
# typed_df.to_csv("csv/typed_uanl.csv", index=False)
# typed_df = pd.read_csv("csv/typed_uanl.csv")
# analyzed_df = analysis_dependencia(typed_df)
# print_tabulate(analyzed_df)
# analyzed_df.to_csv("csv/analyzed_uanl.csv", index=False)
# show_data_by_dependency_and_date()
# show_data_by_type_and_date()
# show_salary_and_count_by_type_and_date()
# show_salary_and_count_by_dependency_and_date()
# analysis("csv/uanl.csv")
# create_boxplot_by_type("csv/typed_uanl.csv", 'Tipo', ["sum"])#"Tipo")
create_plot_por_dependencia("csv/typed_uanl.csv")
# anova_1("csv/typed_uanl.csv")<<imports>>
def analysis(file_name:str)->pd.DataFrame:
df = pd.read_csv(file_name)
df["hab_x_km2"] = df["poblacion_2020"] / df["area_km"]
df["hab_x_mi"] = df["poblacion_2020"] / df["area_mi"]
print(sum(df["poblacion_2020"]))
return df
df = analysis("csv/estados_limpio.csv")
print_tabulate(df.head())
print_tabulate(df.describe())
print(df["poblacion_2020"].sum())| estado | nombre_oficial | capital | ciudad_mas_grande | poblacion_2020 | num_de_municipios | lugar | fecha_de_admision | area_km | area_mi | hab_x_km2 | hab_x_mi | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aguascalientes | Aguascalientes | Aguascalientes | Aguascalientes | 1.42561e+06 | 11 | 24 | 1857-02-05 | 5615.7 | 2168.2 | 253.861 | 657.507 |
| 1 | Baja California | Baja California | Mexicali | Tijuana | 3.76902e+06 | 6 | 29 | 1952-01-16 | 71450 | 27587 | 52.7505 | 136.623 |
| 2 | Baja California Sur | Baja California Sur | La Paz | La Paz | 798447 | 5 | 31 | 1974-10-08 | 73909.4 | 28536.6 | 10.8031 | 27.9798 |
| 3 | Campeche | Campeche | San Francisco de Campeche | San Francisco de Campeche | 928363 | 13 | 25 | 1863-04-29 | 57484.9 | 22195 | 16.1497 | 41.8276 |
| 4 | Chiapas | Chiapas | Tuxtla Gutiérrez | Tuxtla Gutiérrez | 5.54383e+06 | 124 | 19 | 1824-09-14 | 73311 | 28305.5 | 75.6207 | 195.857 |
| poblacion_2020 | num_de_municipios | lugar | area_km | area_mi | hab_x_km2 | hab_x_mi | |
|---|---|---|---|---|---|---|---|
| count | 32 | 32 | 32 | 32 | 32 | 32 | 32 |
| mean | 3.93794e+06 | 77.1875 | 16.5 | 61270.2 | 23656.6 | 309.679 | 802.022 |
| std | 3.27801e+06 | 105.268 | 9.38083 | 53819 | 20779.6 | 1078.71 | 2793.61 |
| min | 731391 | 5 | 1 | 1494.3 | 577 | 10.8031 | 27.9798 |
| 25% | 1.85165e+06 | 17.75 | 8.75 | 24136.1 | 9319.05 | 43.36 | 112.302 |
| 50% | 3.05489e+06 | 48.5 | 16.5 | 58041.8 | 22410 | 67.1707 | 173.971 |
| 75% | 4.94759e+06 | 89.5 | 24.25 | 74250.9 | 28668.4 | 159.016 | 411.848 |
| max | 1.69924e+07 | 570 | 32 | 247413 | 95526.5 | 6163.38 | 15961.8 |
126 014 024.0