I hope others will share their code, since there are many more out there who are much better than me at this, but after a lot of work and even more frustration, I managed to put together a draft of a Python ML code that works on P123 Data.
I use it in Visual Studio and have received some assistance from Cody, Tabnine, and BlackBox.
The code is supposed to do this (I don't have enough programming experience to know if this is what it actually does, and the stocks recommended by the trained models surprised me):
- Imports necessary libraries and modules.
- Defines the main() function.
- Loads the dataset from a CSV file using Dask DataFrame.
- Performs data preprocessing:
- Converts the 'Date' column to datetime format.
- Sorts the data by the 'Date' column.
- Performs feature engineering by extracting year, month, and day from the 'Date' column.
- Removes rows with missing values.
- Checks if the 'Price' column exists in the dataset. If not, it uses the first numeric column as the target.
- Defines the training and testing window sizes (in years).
- Defines the path to the folder where the trained models will be saved.
- Tries to load previously saved results from a pickle file. If the file doesn't exist, it performs walk-forward validation:
- Iterates over different start years for the training and testing periods.
- Filters the training and testing data based on the start year and window sizes.
- Checks if there is enough data for the current training and testing periods.
- Defines the features (X) and target (y) for training and testing data.
- Converts Dask DataFrames to Pandas DataFrames.
- Splits the training data into train and validation sets.
- Defines the hyperparameter grids for RandomizedSearchCV.
- Defines the machine learning models (RandomForest, ExtraTrees, XGBoost) with RandomizedSearchCV.
- Creates the model directory if it doesn't exist.
- Trains and evaluates the models on the train and validation sets.
- Calculates and stores the mean squared error (MSE) for each model.
- Saves the best estimator for each model to disk.
- Defines an ensemble model (VotingRegressor) using the best estimators from the individual models.
- Trains and evaluates the ensemble model on the train and validation sets.
- Calculates and stores the MSE for the ensemble model.
- Saves the ensemble model to disk.
- Saves the results (MSE for each model and period) to a pickle file.
- Prints the model evaluation results (average MSE over all periods) for each model.
- Finds the best model based on the lowest average MSE.
- Finds the last start year for which the best model was trained.
- Loads the best model from disk.
- If the best model is loaded successfully:
- Computes the predictions for the entire dataset using the best model.
- Converts the predictions to a Pandas DataFrame.
- Adds the 'Ticker' column to the predictions DataFrame.
- Sorts the predictions DataFrame by the predicted values in descending order.
- Prints the top 10 recommended stocks based on the model predictions.
- If no model was trained, it prints a message indicating that.
- Calls the main() function if the script is run directly.
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib
import pandas as pd
import pickle
import os
import os.path
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import VotingRegressor
def main():
    """Walk-forward train and evaluate regressors on P123 price data, then
    print the top-10 tickers ranked by the best model's predictions.

    Pipeline:
      1. Lazily load the CSV with Dask; derive Year/Month/Day features from
         the 'Date' column and drop rows with missing values.
      2. Walk-forward loop over start years: tune RandomForest, ExtraTrees
         and XGBoost with RandomizedSearchCV, score each on a 20% validation
         split of the training window, build a VotingRegressor ensemble from
         the tuned estimators, and persist every fitted model to disk.
      3. Pick the model family with the lowest mean MSE across all periods,
         load its most recently trained snapshot, and rank every row of the
         full dataset by predicted value.

    Per-period MSE results are cached in 'walk_forward_results.pkl'; delete
    that file to force re-training.
    """
    # Load the dataset lazily with Dask.
    file_path = r'C:\XXXX\020624RAA.csv'
    data = dd.read_csv(file_path)

    # Preprocessing: 'Date' is assumed to hold the row dates.
    data['Date'] = dd.to_datetime(data['Date'])
    data = data.sort_values('Date')

    # Feature engineering: calendar components of the date.
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day

    # Drop rows with missing values, if any.
    data = data.dropna()

    # Fall back to the first numeric column when 'Price' is absent.
    target_column = 'Price'
    if target_column not in data.columns:
        print(f"Warning: Target column '{target_column}' not found in dataset. Using the first numeric column as the target.")
        numeric_columns = [col for col in data.columns if data[col].dtype.kind in 'bifc']
        if not numeric_columns:
            print("No numeric columns found in the dataset. Exiting.")
            return
        target_column = numeric_columns[0]

    # Training and test window sizes, in years.
    # NOTE(review): the date arithmetic below actually spans train_window + 1
    # calendar years (e.g. 2001-01-01 .. 2004-12-31) -- confirm this is the
    # intended window length.
    train_window = 3
    test_window = 3

    # Folder where trained models are persisted. Created once, up front;
    # exist_ok avoids the check-then-create race of the original.
    modell_mappe = r'C:\XXX\MAKSKINL LAGRING\LAGREDE MODELLER'
    os.makedirs(modell_mappe, exist_ok=True)

    # Hyperparameter search spaces are loop-invariant: define them once,
    # outside the walk-forward loop.
    rf_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
    }
    # Same search space applies to both forest variants.
    et_param_grid = rf_param_grid
    xgb_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
    }

    # Try to load previously saved results; otherwise run the walk-forward
    # validation from scratch.
    try:
        with open('walk_forward_results.pkl', 'rb') as f:
            resultater = pickle.load(f)
    except FileNotFoundError:
        resultater = {}

        # Walk-forward validation over successive start years.
        for start_år in range(2001, 2022 - train_window - test_window + 1):
            train_start_date = f"{start_år}-01-01"
            train_end_date = f"{start_år + train_window}-12-31"
            test_start_date = f"{start_år + train_window + 1}-01-01"
            test_end_date = f"{start_år + train_window + test_window}-12-31"

            # Filter the training and test slices for this period.
            train_data = data[(data['Date'] >= train_start_date) & (data['Date'] <= train_end_date)]
            test_data = data[(data['Date'] >= test_start_date) & (data['Date'] <= test_end_date)]

            # Skip periods without data on either side.
            if len(train_data) == 0 or len(test_data) == 0:
                print(f"Ikke nok data i trenings- eller testdataene for periodene {train_start_date} - {train_end_date} og {test_start_date} - {test_end_date}. Hopper over denne iterasjonen.")
                continue

            # Materialize only the training slice as pandas. The original
            # also computed the test slice, but it was never used for
            # scoring -- that dead (and expensive) .compute() is removed.
            # NOTE(review): scoring below uses a validation split of the
            # training window, not the walk-forward test window -- confirm
            # whether out-of-sample test scoring was intended.
            X_train_pd = train_data[['Year', 'Month', 'Day']].compute()
            y_train_pd = train_data[target_column].compute()

            # Hold out 20% of the training window for validation scoring.
            X_train_pd, X_val_pd, y_train_pd, y_val_pd = train_test_split(
                X_train_pd, y_train_pd, test_size=0.2, random_state=42)

            # Fresh search objects each period so estimators are re-tuned.
            modeller = {
                'RandomForest': RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42), param_distributions=rf_param_grid, n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=-1),
                'ExtraTrees': RandomizedSearchCV(estimator=ExtraTreesRegressor(random_state=42), param_distributions=et_param_grid, n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=-1),
                'XGBoost': RandomizedSearchCV(estimator=XGBRegressor(random_state=42, tree_method='hist'), param_distributions=xgb_param_grid, n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=-1),
            }

            # Train, score on the validation split, and persist each model.
            for navn, modell in modeller.items():
                print(f"Trener {navn} for perioden {train_start_date} - {train_end_date}...")
                modell.fit(X_train_pd, y_train_pd)
                prediksjoner = modell.best_estimator_.predict(X_val_pd)
                mse = mean_squared_error(y_val_pd, prediksjoner)
                resultater.setdefault(navn, []).append(mse)
                print(f"{navn} MSE: {mse}")
                joblib.dump(modell.best_estimator_, os.path.join(modell_mappe, f'{navn}_modell_{start_år}.pkl'))

            # Ensemble of the tuned estimators from this period.
            ensemble_modeller = {
                'VotingRegressor': VotingRegressor([
                    ('rf', modeller['RandomForest'].best_estimator_),
                    ('et', modeller['ExtraTrees'].best_estimator_),
                    ('xgb', modeller['XGBoost'].best_estimator_),
                ])
            }

            for navn, modell in ensemble_modeller.items():
                print(f"Trener {navn} for perioden {train_start_date} - {train_end_date}...")
                modell.fit(X_train_pd, y_train_pd)
                prediksjoner = modell.predict(X_val_pd)
                mse = mean_squared_error(y_val_pd, prediksjoner)
                resultater.setdefault(navn, []).append(mse)
                print(f"{navn} MSE: {mse}")
                joblib.dump(modell, os.path.join(modell_mappe, f'{navn}_modell_{start_år}.pkl'))

        # Cache the per-period MSE results for future runs.
        with open('walk_forward_results.pkl', 'wb') as f:
            pickle.dump(resultater, f)

    # Report average MSE per model family.
    print("\nModell evalueringsresultater (gjennomsnittlig MSE over alle perioder):")
    for navn, mse_list in resultater.items():
        gjennomsnitt_mse = sum(mse_list) / len(mse_list)
        print(f"{navn}: Gjennomsnittlig MSE = {gjennomsnitt_mse}")
    print(f"Innholdet av resultater: {resultater}")

    # Guard: with no results at all, min() below would raise ValueError.
    if not resultater:
        print("Ingen modell ble trent.")
        return

    # Best model family = lowest mean MSE across all periods.
    beste_modell_navn = min(resultater, key=lambda navn: sum(resultater[navn]) / len(resultater[navn]))
    print(f"Den beste modellen er: {beste_modell_navn}")

    # Find the most recent start year for which that family was saved.
    siste_start_år = None
    for start_år in range(2001, 2022 - train_window - test_window + 1):
        modell_filnavn = os.path.join(modell_mappe, f'{beste_modell_navn}_modell_{start_år}.pkl')
        if os.path.exists(modell_filnavn):
            siste_start_år = start_år

    # Load the best model from disk, if any snapshot exists.
    beste_modell = None
    if siste_start_år is not None:
        modell_filnavn = os.path.join(modell_mappe, f'{beste_modell_navn}_modell_{siste_start_år}.pkl')
        print(f"Laster inn den lagrede modellen {modell_filnavn}")
        beste_modell = joblib.load(modell_filnavn)
    else:
        print("Ingen lagret modell ble funnet.")

    # Predict over the entire dataset with the best model.
    if beste_modell is not None:
        print("Laster inn den beste modellen.")
        X_full = data[['Year', 'Month', 'Day']].compute()
        y_pred = beste_modell.predict(X_full)

        # Predictions as a DataFrame; ticker column is attached positionally,
        # which is valid because both come from the same Dask frame in the
        # same row order.
        y_pred_df = pd.DataFrame(y_pred, columns=['Predicted'])
        y_pred_df['Ticker'] = data['Ticker'].compute().reset_index(drop=True)

        # Rank by predicted value, highest first.
        y_pred_df = y_pred_df.sort_values('Predicted', ascending=False)

        print("\nDe 10 mest anbefalte aksjene basert på modelltreningen:")
        print(y_pred_df[['Ticker', 'Predicted']].head(10))
    else:
        print("Ingen modell ble trent.")
# Run the full pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()