import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
gold_train = pd.read_csv("https://code.s3.yandex.net/datasets/gold_recovery_train.csv")
gold_train.head()
gold_train.info()
gold_test = pd.read_csv("https://code.s3.yandex.net/datasets/gold_recovery_test.csv")
gold_test.head()
gold_test.info()
gold_full = pd.read_csv("https://code.s3.yandex.net/datasets/gold_recovery_full.csv")
gold_full.head()
gold_full.info()
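The recovery calculation below implements the standard formula

$$\mathit{Recovery} = \frac{C \times (F - T)}{F \times (C - T)} \times 100\%$$

where $C$ is the share of gold in the concentrate after flotation (rougher.output.concentrate_au), $F$ is the share of gold in the feed (rougher.input.feed_au), and $T$ is the share of gold in the rougher tails (rougher.output.tail_au).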
C = gold_train["rougher.output.concentrate_au"]
F = gold_train["rougher.input.feed_au"]
T = gold_train["rougher.output.tail_au"]
gold_train["rougher.output.recovery.calculated"] = (C * (F - T)) / (F * (C - T)) * 100
# .mean() skips NaN rows, so the MAE is computed only where both values exist
mae = (gold_train["rougher.output.recovery.calculated"] - gold_train["rougher.output.recovery"]).abs().mean()
print("MAE:", mae)
gold_train.columns.difference(gold_test.columns)
gold_train = gold_train.ffill()
gold_test = gold_test.ffill()
gold_full = gold_full.ffill()
gold_full_merge = gold_full[[
    "date", "rougher.output.recovery", "final.output.recovery",
    "rougher.output.concentrate_au", "rougher.output.concentrate_ag",
    "rougher.output.concentrate_pb", "rougher.output.concentrate_sol",
    "final.output.concentrate_au", "final.output.concentrate_ag",
    "final.output.concentrate_pb", "final.output.concentrate_sol",
]]
gold_test = gold_test.merge(gold_full_merge, on="date", how="left")
gold_full_merge = gold_full_merge.drop(['date', 'rougher.output.recovery', 'final.output.recovery'], axis=1)
gold_train = gold_train.drop("date", axis=1)
gold_test = gold_test.drop("date", axis=1)
gold_full = gold_full.drop("date", axis=1)
Filled the NaN values in all three dataframes using ffill, which forward-fills each gap with the last valid observation (appropriate here because the rows are ordered by time). Merged the recovery targets and concentrate columns from the full dataframe into the test dataframe, which lacks them. Dropped the date column from all dataframes since it is not needed for model fitting.
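As a quick sanity check (an illustrative addition, not part of the original pipeline), the remaining NaN counts can be printed; ffill cannot fill a column's leading NaNs, since there is no earlier observation to copy:

# Count remaining NaNs per dataframe after the forward fill
for name, df in [("train", gold_train), ("test", gold_test), ("full", gold_full)]:
    print(name, "remaining NaNs:", df.isna().sum().sum())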
metal_au = gold_full[["rougher.input.feed_au", "rougher.output.concentrate_au", "primary_cleaner.output.concentrate_au", "final.output.concentrate_au"]]
metal_ag = gold_full[["rougher.input.feed_ag", "rougher.output.concentrate_ag", "primary_cleaner.output.concentrate_ag", "final.output.concentrate_ag"]]
metal_pb = gold_full[["rougher.input.feed_pb", "rougher.output.concentrate_pb", "primary_cleaner.output.concentrate_pb", "final.output.concentrate_pb"]]
fig, axes = plt.subplots(3, 1, figsize=(10, 20))
# sns.distplot is deprecated; sns.histplot is its modern replacement
for column in list(metal_au):
    sns.histplot(metal_au[column], ax=axes[0])
axes[0].set(title="Au", xlabel="Concentration %", ylabel="Amount")
for column in list(metal_ag):
    sns.histplot(metal_ag[column], ax=axes[1])
axes[1].set(title="Ag", xlabel="Concentration %", ylabel="Amount")
for column in list(metal_pb):
    sns.histplot(metal_pb[column], ax=axes[2])
axes[2].set(title="Pb", xlabel="Concentration %", ylabel="Amount")
fig.suptitle("Metal Concentrations at Purification Stages")
fig.legend(["rougher.input.feed", "rougher.output.concentrate", "primary_cleaner.output.concentrate", "final.output.concentrate"])
fig.show()
Plotted Au, Ag, and Pb metal concentrations at each purification stage.
The concentration of Au increases uniformly from stage to stage. The concentration of Ag rises and falls slightly across stages, resulting in a net decrease. The concentration of Pb increases slightly across stages, resulting in a net increase, with the final concentrate similar to the primary cleaner output concentrate.
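To back the visual reading with numbers, an illustrative check (not in the original analysis) of the mean concentration at each stage:

# Mean concentration per stage: Au should rise monotonically from feed to final concentrate
for name, metal in [("Au", metal_au), ("Ag", metal_ag), ("Pb", metal_pb)]:
    print(name)
    print(metal.mean().round(2))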
fig, axes = plt.subplots(2,1, figsize=(10, 15))
axes[0].hist(gold_train["primary_cleaner.input.feed_size"], density=True, alpha=0.5, bins=50)
axes[0].hist(gold_test["primary_cleaner.input.feed_size"], density=True, alpha=0.5, bins=50)
axes[0].set(title="Primary Cleaner Input Feed Size", xlabel="Amount", ylabel="Size")
axes[1].hist(gold_train["rougher.input.feed_size"], density=True, alpha=0.5, bins=50)
axes[1].hist(gold_test["rougher.input.feed_size"], density=True, alpha=0.5, bins=50)
axes[1].set(title="Rougher Input Feed Size", xlabel="Amount", ylabel="Size")
fig.suptitle("Feed Particle Size Distribution")
fig.legend(["Train Set", "Test Set"])
fig.show()
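For a quantitative complement to the visual comparison, a two-sample Kolmogorov-Smirnov test is one option; this is an illustrative sketch assuming scipy is available, not part of the original notebook:

from scipy.stats import ks_2samp

# A small KS statistic suggests the train and test feed-size distributions are similar
# (with samples this large, even tiny differences can yield small p-values)
for col in ["primary_cleaner.input.feed_size", "rougher.input.feed_size"]:
    stat, p = ks_2samp(gold_train[col].dropna(), gold_test[col].dropna())
    print(col, "KS statistic:", round(stat, 4), "p-value:", round(p, 4))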
def raw_feed(df):
return df["rougher.input.feed_au"] + df["rougher.input.feed_ag"] + df["rougher.input.feed_pb"] + df["rougher.input.feed_sol"]
def rougher_conc(df):
return df["rougher.output.concentrate_au"] + df["rougher.output.concentrate_ag"] + df["rougher.output.concentrate_pb"] + df["rougher.output.concentrate_sol"]
def final_conc(df):
return df["final.output.concentrate_au"] + df["final.output.concentrate_ag"] + df["final.output.concentrate_pb"] + df["final.output.concentrate_sol"]
gold_full["rougher.input.feed"] = raw_feed(gold_full)
gold_full["rougher.output.concentrate"] = rougher_conc(gold_full)
gold_full["final.output.concentrate"] = final_conc(gold_full)
total_conc = gold_full[["rougher.input.feed", "rougher.output.concentrate", "final.output.concentrate"]]
fig = plt.figure(figsize=(10, 6))
for column in list(total_conc):
    sns.histplot(total_conc[column])
plt.legend(list(total_conc))
plt.title("Total Concentration at Stages")
plt.xlabel("Concentration %")
plt.ylabel("Amount")
fig.show()
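The spike near zero in these histograms motivates the filtering applied below; as an illustrative addition, the number of rows at or below the threshold of 20 used there can be counted:

# Rows where the total concentration at any stage is abnormally low
low_total = (total_conc <= 20).any(axis=1)
print("Rows at or below 20:", low_total.sum())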
def final_smape(y, y_hat):
    # cross_validate passes predictions as a NumPy array, so convert both inputs
    # instead of relying on DataFrame-only .iloc indexing
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    # Column 0: rougher.output.recovery
    rougher_num = np.abs(y[:, 0] - y_hat[:, 0])
    rougher_den = (np.abs(y[:, 0]) + np.abs(y_hat[:, 0])) / 2
    smape_rougher = np.mean(rougher_num / rougher_den) * 100
    # Column 1: final.output.recovery
    final_num = np.abs(y[:, 1] - y_hat[:, 1])
    final_den = (np.abs(y[:, 1]) + np.abs(y_hat[:, 1])) / 2
    smape_final = np.mean(final_num / final_den) * 100
    # Weighted combination: 25% rougher, 75% final
    return smape_rougher * 0.25 + smape_final * 0.75
smape_scorer = make_scorer(final_smape, greater_is_better=False)
The evaluation metric is defined by these formulas: $$\mathit{sMAPE} = \frac{1}{N} \sum_{i=1}^{N} \frac{\left| y_i - \hat{y}_i \right|}{\left( \left| y_i \right| + \left|\hat{y}_i \right| \right) / 2} \times 100\%$$ $$\mathit{Final\ sMAPE} = 25\% \times \mathit{sMAPE}\left( \mathit{rougher} \right) + 75\% \times \mathit{sMAPE} \left( \mathit{final} \right)$$
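A tiny hand-checked example (an illustrative addition) confirming the scoring function matches the formulas:

# Rougher: |100 - 50| / ((100 + 50) / 2) * 100 = 66.67%; final: 0%
# Weighted score: 0.25 * 66.67 + 0.75 * 0 = 16.67 (roughly)
toy_y = np.array([[100.0, 100.0]])
toy_y_hat = np.array([[50.0, 100.0]])
print(final_smape(toy_y, toy_y_hat))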
gold_train["rougher.input.feed"] = raw_feed(gold_train)
gold_train["rougher.output.concentrate"] = rougher_conc(gold_train)
gold_train["final.output.concentrate"] = final_conc(gold_train)
gold_train = gold_train[(gold_train["rougher.input.feed"] > 20) & (gold_train["rougher.output.concentrate"] > 20) & (gold_train["final.output.concentrate"] > 20)]
gold_train = gold_train.drop(["rougher.input.feed", "rougher.output.concentrate", "final.output.concentrate"], axis=1)
gold_test["rougher.input.feed"] = raw_feed(gold_test)
gold_test["rougher.output.concentrate"] = rougher_conc(gold_test)
gold_test["final.output.concentrate"] = final_conc(gold_test)
gold_test = gold_test[(gold_test["rougher.input.feed"] > 20) & (gold_test["rougher.output.concentrate"] > 20) & (gold_test["final.output.concentrate"] > 20)]
gold_test = gold_test.drop(["rougher.input.feed", "rougher.output.concentrate", "final.output.concentrate"], axis=1)
gold_test = gold_test.drop(list(gold_full_merge.columns.values), axis=1)
gold_train = gold_train.loc[:, list(gold_test.columns)]
features_train = gold_train.drop(columns=["rougher.output.recovery", "final.output.recovery"])
target_train = gold_train[["rougher.output.recovery", "final.output.recovery"]]
feature_scaler = StandardScaler()
features_train = feature_scaler.fit_transform(features_train)
def smape_cv(model, features, target):
    # greater_is_better=False makes the scorer return negated values, so take the absolute value of the CV average
    return np.abs(np.average(cross_validate(model, features, target, scoring=smape_scorer)["test_score"]))
features_test = gold_test.drop(columns=["rougher.output.recovery", "final.output.recovery"])
target_test = gold_test[["rougher.output.recovery", "final.output.recovery"]]
print(target_test)
features_test = feature_scaler.transform(features_test)
state = np.random.RandomState(12345)
print("Decision Tree")
for depth in range(100, 501, 100):
model = DecisionTreeRegressor(max_depth=depth, random_state=state)
model.fit(features_train, target_train)
    # Store under a distinct name so the final_smape scoring function is not overwritten
    cv_smape = smape_cv(model, features_train, target_train)
    print("max_depth =", depth, ":", cv_smape)
print("Random Forest")
for estim in range(10, 51, 10):
model = RandomForestRegressor(n_estimators=estim, random_state=state)
model.fit(features_train, target_train)
    cv_smape = smape_cv(model, features_train, target_train)
    print("n_estimators =", estim, ":", cv_smape)
model = LinearRegression()
model.fit(features_train, target_train)
cv_smape = smape_cv(model, features_train, target_train)
print("Linear Regression", ":", cv_smape)
model = RandomForestRegressor(n_estimators=40, random_state=state)
model.fit(features_train, target_train)
# Evaluate on the held-out test set directly; cross-validating on the test set would refit the model on it
test_smape = final_smape(target_test, model.predict(features_test))
print("sMAPE :", test_smape)
constant_model_mean = target_train.mean()
# Index the means by label; positional indexing on a labeled Series is deprecated in pandas
constant_model_test = pd.DataFrame({
    "rougher.output.recovery": constant_model_mean["rougher.output.recovery"],
    "final.output.recovery": constant_model_mean["final.output.recovery"],
}, index=target_test.index)
constant_smape = final_smape(target_test, constant_model_test)
print("Constant model sMAPE:", constant_smape)
Dropped rows in the train and test sets where total concentration at any stage was at or below 20, removing abnormal values.
Trained and cross-validated a DecisionTreeRegressor at various depths.
Trained and cross-validated a RandomForestRegressor with various n_estimators.
Trained and cross-validated a LinearRegression model.
Trained the final model, a RandomForestRegressor with n_estimators=40, and evaluated it on the test set.
Determined the baseline sMAPE of a constant model predicting the mean.
The best model observed is the RandomForestRegressor with n_estimators=40, with a final sMAPE of 8%. The constant model predicting the mean scores 7.6%, which is very close and indicates that the machine learning model is not significantly better than the constant baseline. The constant model also outperforms every other model tested.
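As an aside, the same baseline can be reproduced with scikit-learn's DummyRegressor, which keeps the comparison inside the same evaluation code (an illustrative alternative, not part of the original notebook):

from sklearn.dummy import DummyRegressor

# strategy="mean" predicts the training-set mean of each target for every row,
# which is exactly the constant baseline constructed by hand above
dummy = DummyRegressor(strategy="mean")
dummy.fit(features_train, target_train)
print("DummyRegressor sMAPE:", final_smape(target_test, dummy.predict(features_test)))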