elleobrien · othrou · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025
diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml
@@ -0,0 +1,18 @@
+name: model-wine-quality
+on: [push]
+jobs:
+  run:
+    runs-on: [ubuntu-latest]
+    # CML image with Python 3; the Docker Hub organisation is "dvcorg"
+    container: docker://dvcorg/cml-py3:latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: cml_run
+        env:
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # Train the housing model; the script writes metrics.txt
+          pip install -r requirements.txt
+          python housing_prediction.py
+          echo "MODEL METRICS"
+          cat metrics.txt
diff --git a/README.md b/README.md
@@ -1,2 +1,3 @@
-# Wine quality prediction
-Modelling a Kaggle dataset of [red wine properties and quality ratings](https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009). 
+# House price prediction
+
+Modelling a Kaggle dataset for Boston housing price prediction.
diff --git a/housing_prediction.py b/housing_prediction.py
@@ -0,0 +1,87 @@
+import pandas as pd 
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.datasets import load_boston
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+# Set random seed
+seed = 42
+
+################################
+########## DATA PREP ###########
+################################
+
+# Load in the Boston housing dataset
+boston = load_boston()
+df = pd.DataFrame(boston.data, columns=boston.feature_names)
+df['PRICE'] = boston.target  # Add target column
+
+# Split into train and test sections
+y = df.pop("PRICE")
+X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=seed)
+
+#################################
+########## MODELLING ############
+#################################
+
+# Fit a model on the train section
+regr = RandomForestRegressor(max_depth=2, random_state=seed)
+regr.fit(X_train, y_train)
+
+# Report training set score
+train_score = regr.score(X_train, y_train) * 100
+# Report test set score
+test_score = regr.score(X_test, y_test) * 100
+
+# Write scores to a file
+with open("metrics.txt", 'w') as outfile:
+    outfile.write("Training variance explained: %2.1f%%\n" % train_score)
+    outfile.write("Test variance explained: %2.1f%%\n" % test_score)
+
+
+##########################################
+##### PLOT FEATURE IMPORTANCE ############
+##########################################
+# Calculate feature importance in random forest
+importances = regr.feature_importances_
+labels = df.columns
+feature_df = pd.DataFrame(list(zip(labels, importances)), columns = ["feature","importance"])
+feature_df = feature_df.sort_values(by='importance', ascending=False,)
+
+# image formatting
+axis_fs = 18 #fontsize
+title_fs = 22 #fontsize
+sns.set(style="whitegrid")
+
+ax = sns.barplot(x="importance", y="feature", data=feature_df)
+ax.set_xlabel('Importance', fontsize = axis_fs) 
+ax.set_ylabel('Feature', fontsize = axis_fs)  # ylabel
+ax.set_title('Random forest\nfeature importance', fontsize = title_fs)
+
+plt.tight_layout()
+plt.savefig("feature_importance.png", dpi=120) 
+plt.close()
+
+
+##########################################
+############ PLOT RESIDUALS  #############
+##########################################
+
+y_pred = regr.predict(X_test) + np.random.normal(0, 0.25, len(y_test))
+y_jitter = y_test + np.random.normal(0, 0.25, len(y_test))
+res_df = pd.DataFrame(list(zip(y_jitter, y_pred)), columns=["true", "pred"])
+
+ax = sns.scatterplot(x="true", y="pred", data=res_df)
+ax.set_aspect('equal')
+ax.set_xlabel('True house price', fontsize = axis_fs) 
+ax.set_ylabel('Predicted house price', fontsize = axis_fs)  # ylabel
+ax.set_title('Residuals', fontsize = title_fs)
+
+# Make it pretty- square aspect ratio
+ax.plot([1, 50], [1, 50], 'black', linewidth=1)
+plt.ylim((10, 50))
+plt.xlim((10, 50))
+
+plt.tight_layout()
+plt.savefig("residuals.png", dpi=120)