Data_analysis

Estadistica descriptiva a los datos por medio de funciones de agregacion, es importate saber un poco de algebra relacional.

Funciones de agregación
- min
- max
- moda
- conteo
- sumatoria
- media
- variaza
- desviacion estándar
- asimetria
- kurtosis
Algebra relacional
- transposicion
- selección
- proyección
- uniones
- joins
- agrupación

Imports

import pandas as pd
from tabulate import tabulate
from typing import Tuple, List
import matplotlib

matplotlib.use('TKAgg')

def print_tabulate(df: pd.DataFrame):
    print(tabulate(df, headers=df.columns, tablefmt='orgtbl'))

UANL

<<imports>>
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

def categorize(name:str)->str:
    if 'PREPARATORIA' in name or 'PREPA.' in name:
        return 'PREPARATORIA'
    if 'FACULTAD' in name or 'FAC.' in name:
        return 'FACULTAD'
    if 'HOSPITAL' in name:
        return 'HOSPITAL'
    if 'CENTRO' in name or 'CTRO.' in name or 'C.' in name or 'INVESTIGAC' in name :
        return 'CENTRO'
    if 'SECRETARÍA' in name or 'SECRETARIA' in name or 'SRIA.' in name or 'DIRECCIÓN' in name or 'DIRECCION' in name or \
       'DEPARTAMENTO' in name or 'DEPTO.' in name or 'CONTRALORIA' in name or 'AUDITORIA' in name or 'TESORERIA' in name \
       or 'ESCOLAR' in name or 'ABOGACÍA' in name  or 'JUNTA' in name  or 'RECTORIA' in name  or 'IMAGEN' in name :
        return 'ADMIN'
    return 'OTRO'

def transform_into_typed_df(raw_df: pd.DataFrame)->pd.DataFrame:
    raw_df["Fecha"] = pd.to_datetime(raw_df["anio"].map(str)+ "-" + raw_df["mes"].map(str), format="%Y-%m")
    raw_df = raw_df.drop(['anio', 'mes'], axis=1)
    raw_df["Tipo"] = raw_df["dependencia"].map(categorize)
    return raw_df


def analysis_dependencia(df_complete: pd.DataFrame)-> pd.DataFrame:
    df_complete["Fecha"] = pd.to_datetime(df_complete["Fecha"], format="%Y-%m-%d")
    df_complete["anio"] = df_complete["Fecha"].dt.year
    df_by_dep = df_complete.groupby(["Tipo", "anio"]).agg({'Sueldo Neto': ['sum', 'count', 'mean', 'min', 'max']})
    df_by_dep = df_by_dep.reset_index()
    df_by_dep.columns = ['Tipo', 'anio', 'Suma_Total_sueldos', 'Conteo_Empleados', 'Promedio_sueldo', 'Salario_Minimo', 'Salario_Maximo']
    # print_tabulate(df_by_dep.head())
    #df_by_dep = df_complete.groupby(["dependencia", "Fecha"]).agg({'Sueldo Neto': ['sum', 'count', 'mean', 'min', 'max']})
    return df_by_dep


def create_boxplot_by_type(file_name:str, column: str, aggregate_functions=['sum']):
    df_complete = pd.read_csv(file_name)
    df_by_type = df_complete.groupby([column,"Fecha"]).agg({'Sueldo Neto': aggregate_functions})# .count()
    df_by_type = df_by_type.reset_index()
    df_by_type.columns = [column, 'Fecha'] + [f'sueldo_neto_{aggregate_function}' for aggregate_function in aggregate_functions]
    df_by_type.boxplot(by = column, figsize=(27,18))
    plt.xticks(rotation=90)
    plt.savefig(f"img/boxplot_{column}.png")
    plt.close()


def plot_by_dep(df: pd.DataFrame, dep:str)->None:
    df[df["dependencia"] == dep].plot(y =["Sueldo Neto"])
    plt.xticks(rotation=90)
    plt.savefig(f"img/lt_{dep}.png")
    # df[df["dependencia"] == dep].boxplot(by ='dependencia')
    # plt.savefig(f"img/bplt_{dep}.png")


def create_plot_por_dependencia(file_name:str):
    df_complete = pd.read_csv(file_name)
    df_by_dep = df_complete.groupby(["dependencia", "Fecha"])[["Sueldo Neto"]].agg({'Sueldo Neto': ['sum']})
    df_by_dep.reset_index(inplace=True)
    df_by_dep.set_index("Fecha", inplace=True)

    for dep in set(df_by_dep["dependencia"]):
       plot_by_dep(df_by_dep, dep)


    df_aux = df_complete.groupby(["Fecha","Tipo"])[['Sueldo Neto']].sum().unstack()
    df_aux.plot(y = 'Sueldo Neto', legend=False, figsize=(32,18))
    plt.xticks(rotation=90)
    plt.savefig("img/foo.png")
    plt.close()

def anova(df_aux: pd.DataFrame, str_ols: str):
    # shaphiro-wills
    # Levenes or barletts
    modl = ols(str_ols, data=df_aux).fit()
    anova_df = sm.stats.anova_lm(modl, typ=2)
    if anova_df["PR(>F)"][0] < 0.005:
        print("hay diferencias")
        print(anova_df)
        # Prueba tukey
        # imprimir los resultados
    else:
        print("No hay diferencias")

def anova_1(file_name: str):
    df_complete = pd.read_csv(file_name)
    df_by_type = df_complete.groupby(["Tipo", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
    df_by_type.reset_index(inplace=True)
    # df_by_type.set_index("Fecha", inplace=True)
    # df_by_type.reset_index(inplace=True)
    df_aux = df_by_type.rename(columns={"Sueldo Neto": "GastoSalarios"}).drop(['Fecha'], axis=1)
    df_aux = df_aux.loc[df_aux["Tipo"].isin(["CENTRO","OTRO"])]
 # .isin(["ADMIN","CENTRO","OTRO","HOSPITAL","PREPARATORIA"])]
    print(df_aux.head())
    anova(df_aux, "GastoSalarios ~ Tipo")

def analysis(file_name:str)->None:
    df_complete = pd.read_csv(file_name)
    # print_tabulate(df_complete[["dependencia","Tipo"]].drop_duplicates().head(150))
    df_by_dep = df_complete.groupby(["dependencia", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
    df_by_type = df_complete.groupby(["Tipo", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)# .count()

    # df_by_dep_by_anio = df_by_dep.groupby(["dependencia","anio"]).aggregate(pd.DataFrame.sum).sort_values(by=["dependencia", "anio"], ascending=True)
    df_by_dep.reset_index(inplace=True)
    df_by_dep.set_index("Fecha", inplace=True)
    # print_tabulate(df_by_dep.head(5))

    # for dep in set(df_by_dep["dependencia"]):
    #    plot_by_dep(df_by_dep, dep)
    # df_aux = df_complete.groupby(["Fecha","dependencia"])[['Sueldo Neto']].mean().unstack()
    # df_aux.plot(y = 'Sueldo Neto', legend=False, figsize=(32,18))
    # plt.xticks(rotation=90)
    # plt.savefig("img/foo.png")
    # plt.close()

    df_by_type.boxplot(by = 'Tipo', figsize=(18,9))
    plt.xticks(rotation=90)
    plt.savefig("img/boxplot_tipo.png")
    plt.close()

    # aux = df_complete.groupby(["Tipo"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
    # aux.reset_index(inplace=True)
    df_by_type.reset_index(inplace=True)
    df_aux = df_by_type.rename(columns={"Sueldo Neto": "GastoSalarios"}).drop(['Fecha'], axis=1)
    print(df_aux.head())

    # shaphiro-wills
    # Levenes or barletts
    modl = ols("GastoSalarios ~ Tipo", data=df_aux).fit()
    anova_df = sm.stats.anova_lm(modl, typ=2)
    if anova_df["PR(>F)"][0] < 0.005:
        print("hay diferencias")
        print(anova_df)
        # Prueba tukey
        # imprimir los resultados
    else:
        print("No hay diferencias")



    # df_by_dep.boxplot(by ='dependencia', figsize=(32,18))
    # plt.xticks(rotation=90)
    # plt.savefig("img/boxplot.png")# , bbox_inches='tight')
    # plt.close()



def create_typed_df(filename:str)-> pd.DataFrame:
    df_complete = pd.read_csv(filename)
    raw_df = transform_into_typed_df(df_complete)
    return raw_df

def show_type_of_department():
    df_complete = pd.read_csv("csv/typed_uanl.csv")
    print_tabulate(df_complete[["dependencia","Tipo"]].\
                   drop_duplicates().head(150))

def show_data_by_dependency_and_date():
    df_complete = pd.read_csv("csv/typed_uanl.csv")
    df_by_dep = df_complete.groupby(["dependencia", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
    df_by_dep.reset_index(inplace=True)
    df_by_dep.set_index("Fecha", inplace=True)
    print_tabulate(df_by_dep[df_by_dep["dependencia"]== "FAC. DE CIENCIAS FISICO-MATEMATICAS"].head(50))


def show_data_by_type_and_date():
    df_complete = pd.read_csv("csv/typed_uanl.csv")
    df_by_type = df_complete.groupby(["Tipo", "Fecha"])[["Sueldo Neto"]].aggregate(pd.DataFrame.sum)
    df_by_type.reset_index(inplace=True)
    df_by_type.set_index("Fecha", inplace=True)
    print_tabulate(df_by_type.head(150))


def show_salary_and_count_by_type_and_date():
    df_complete = pd.read_csv("csv/typed_uanl.csv")
    df_by_type = df_complete.groupby(["Tipo", "Fecha"]).agg({'Sueldo Neto': ['sum', 'count', 'mean', 'min']})
    df_by_type.reset_index(inplace=True)
    df_by_type.columns = ['Tipo', 'Fecha', 'Total_sueldos', 'Conteo_Empleado', 'Promedio_sueldo', 'Salario_Maximo']
    df_by_type.set_index("Fecha", inplace=True)
    print_tabulate(df_by_type.head(150))

def show_salary_and_count_by_dependency_and_date():
    df_complete = pd.read_csv("csv/typed_uanl.csv")
    df_by_type = df_complete.groupby(["dependencia", "Fecha"]).agg({'Sueldo Neto': ['sum', 'count', 'mean', 'max']})
    df_by_type.reset_index(inplace=True)
    df_by_type.columns = ['Tipo', 'Fecha', 'Total_sueldos', 'Conteo_Empleado', 'Promedio_sueldo', 'Salario_Maximo']
    df_by_type.set_index("Fecha", inplace=True)
    print_tabulate(df_by_type)



if __name__ == "__main__":
    # print_tabulate(typed_df.head(50))
    # typed_df = create_typed_df("csv/uanl.csv")
    # typed_df.to_csv("csv/typed_uanl.csv", index=False)
    # typed_df = pd.read_csv("csv/typed_uanl.csv")
    # analyzed_df = analysis_dependencia(typed_df)
    # print_tabulate(analyzed_df)

    # analyzed_df.to_csv("csv/analyzed_uanl.csv", index=False)
    # show_data_by_dependency_and_date()
    # show_data_by_type_and_date()
    # show_salary_and_count_by_type_and_date()
    # show_salary_and_count_by_dependency_and_date()
    # analysis("csv/uanl.csv")

    # create_boxplot_by_type("csv/typed_uanl.csv", 'Tipo', ["sum"])#"Tipo")
    create_plot_por_dependencia("csv/typed_uanl.csv")

    # anova_1("csv/typed_uanl.csv")

var/home/jhernandez.local/lib/python3.12/site-packages/pandas/plotting/_matplotlib/core.py:580: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. fig = self.plt.figure(figsize=self.figsize)

Estados

<<imports>>
def analysis(file_name:str)->pd.DataFrame:
    df = pd.read_csv(file_name)
    df["hab_x_km2"] = df["poblacion_2020"] / df["area_km"]
    df["hab_x_mi"] = df["poblacion_2020"] / df["area_mi"]
    print(sum(df["poblacion_2020"]))
    return df

df = analysis("csv/estados_limpio.csv")
print_tabulate(df.head())
print_tabulate(df.describe())
print(df["poblacion_2020"].sum())

126 014 024.0

	estado	nombre_oficial	capital	ciudad_mas_grande	poblacion_2020	num_de_municipios	lugar	fecha_de_admision	area_km	area_mi	hab_x_km2	hab_x_mi
0	Aguascalientes	Aguascalientes	Aguascalientes	Aguascalientes	1.42561e+06	11	24	1857-02-05	5615.7	2168.2	253.861	657.507
1	Baja California	Baja California	Mexicali	Tijuana	3.76902e+06	6	29	1952-01-16	71450	27587	52.7505	136.623
2	Baja California Sur	Baja California Sur	La Paz	La Paz	798447	5	31	1974-10-08	73909.4	28536.6	10.8031	27.9798
3	Campeche	Campeche	San Francisco de Campeche	San Francisco de Campeche	928363	13	25	1863-04-29	57484.9	22195	16.1497	41.8276
4	Chiapas	Chiapas	Tuxtla Gutiérrez	Tuxtla Gutiérrez	5.54383e+06	124	19	1824-09-14	73311	28305.5	75.6207	195.857

	poblacion_2020	num_de_municipios	lugar	area_km	area_mi	hab_x_km2	hab_x_mi
count	32	32	32	32	32	32	32
mean	3.93794e+06	77.1875	16.5	61270.2	23656.6	309.679	802.022
std	3.27801e+06	105.268	9.38083	53819	20779.6	1078.71	2793.61
min	731391	5	1	1494.3	577	10.8031	27.9798
25%	1.85165e+06	17.75	8.75	24136.1	9319.05	43.36	112.302
50%	3.05489e+06	48.5	16.5	58041.8	22410	67.1707	173.971
75%	4.94759e+06	89.5	24.25	74250.9	28668.4	159.016	411.848
max	1.69924e+07	570	32	247413	95526.5	6163.38	15961.8

126 014 024.0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Data_analysis

Imports

UANL

Estados

FilesExpand file tree

data_analysis.org

Latest commit

History

data_analysis.org

File metadata and controls

Data_analysis

Imports

UANL

Estados