## Complete Technical Guide: From Data Collection to a Working Model

This article provides a step-by-step walkthrough of the architecture and implementation of an analytical system for predicting football match outcomes. The system uses the Claude API from Anthropic as its "brain" — for data interpretation, feature engineering, and generating final predictions. The key innovation is combining three probability layers: bookmaker odds (Bet365), Polymarket prediction market data (blockchain-based crowd intelligence), and a custom ML model. The entire pipeline is written in Python using pandas, scikit-learn, XGBoost, and matplotlib.
## System Architecture
The system consists of several layers, each serving a specific role:
┌─────────────────────────────────────────────────────────────┐
│ DATA LAYER │
│ football-data.co.uk │ API-Football │ FBref │ Kaggle │
│ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ 🔗 Polymarket Gamma API (prediction market) │ │
│ │ Crowd-sourced probabilities on the Polygon chain │ │
│ └──────────────────────────────────────────────────┘ │
└──────────────────────────┬───────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ PROCESSING LAYER │
│ pandas │ numpy │ data cleaning │ feature engineering │
│ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ Claude API: feature generation, │ │
│ │ context analysis, statistics interpretation │ │
│ └──────────────────────────────────────────────────┘ │
│ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ Merging 3 probability layers: │ │
│ │ Bookmaker odds + Polymarket prices + ML model │ │
│ └──────────────────────────────────────────────────┘ │
└──────────────────────────┬───────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ MODEL LAYER │
│ Logistic Regression │ Random Forest │ XGBoost │
│ Ensemble (Voting / Stacking) │
└──────────────────────────┬───────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ INTERPRETATION LAYER │
│ Claude API: natural language prediction explanation │
│ + confidence assessment + divergence analysis │
│ between bookmaker / Polymarket / ML │
└──────────────────────────┬───────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ OUTPUT LAYER │
│ matplotlib visualizations │ JSON reports │ Telegram bot │
└─────────────────────────────────────────────────────────────┘

Required Dependencies
# requirements.txt
anthropic>=0.40.0
pandas>=2.1.0
numpy>=1.24.0
scikit-learn>=1.3.0
xgboost>=2.0.0
matplotlib>=3.8.0
seaborn>=0.13.0
requests>=2.31.0
python-dotenv>=1.0.0
schedule>=1.2.0 # pipeline automation

Installation:
pip install anthropic pandas numpy scikit-learn xgboost matplotlib seaborn requests python-dotenv schedule

The Polymarket Gamma API does not require a dedicated SDK - all requests are made via `requests` to public REST endpoints without authentication.
## Data Collection and Preparation
The primary data source is football-data.co.uk, which provides CSV files with match results and statistics for all major European leagues. The data includes goals, shots, corners, fouls, cards, and bookmaker odds.
Data Loading
import pandas as pd
import numpy as np
from pathlib import Path
class FootballDataLoader:
    """
    Historical football match data loader.
    Source: football-data.co.uk

    One CSV is requested per (league, season) pair from
    ``{BASE_URL}/{season}/{league}.csv``.
    """

    BASE_URL = "https://www.football-data.co.uk/mmz4281"

    # League code used in the CSV URL -> display name stored in "League".
    LEAGUES = {
        "E0": "Premier League",
        "SP1": "La Liga",
        "D1": "Bundesliga",
        "I1": "Serie A",
        "F1": "Ligue 1",
    }

    # Columns retained from each CSV; columns absent from a file are skipped.
    COLUMNS_TO_KEEP = [
        "Date", "HomeTeam", "AwayTeam",
        "FTHG", "FTAG", "FTR",      # Final score and result
        "HTHG", "HTAG", "HTR",      # Half-time score
        "HS", "AS",                 # Shots
        "HST", "AST",               # Shots on target
        "HF", "AF",                 # Fouls
        "HC", "AC",                 # Corners
        "HY", "AY",                 # Yellow cards
        "HR", "AR",                 # Red cards
        "B365H", "B365D", "B365A",  # Bet365 odds
    ]

    def __init__(self, seasons: list[str], leagues: list[str] = None):
        """
        Store the seasons and leagues to load; nothing is downloaded here.

        Args:
            seasons: Season codes, format: ["2324", "2223", "2122"].
            leagues: League codes (keys of LEAGUES). None or an empty list
                selects every league in LEAGUES.
        """
        self.seasons = seasons
        self.leagues = leagues or list(self.LEAGUES)

    def load_season(self, league: str, season: str) -> pd.DataFrame:
        """
        Load data for a single season and league.

        Returns:
            The matches with the available COLUMNS_TO_KEEP plus "League" and
            "Season" columns, or an empty DataFrame if anything goes wrong.
        """
        url = f"{self.BASE_URL}/{season}/{league}.csv"
        # Deliberate best-effort: any failure (download, parsing, missing key
        # columns) is reported and turned into an empty frame so that one bad
        # file does not abort a multi-league, multi-season load.
        try:
            df = pd.read_csv(url, encoding="utf-8", on_bad_lines="skip")
            available_cols = [c for c in self.COLUMNS_TO_KEEP if c in df.columns]
            df = df[available_cols].dropna(subset=["HomeTeam", "AwayTeam", "FTR"])
            df["League"] = self.LEAGUES.get(league, league)
            df["Season"] = season
            return df
        except Exception as e:
            print(f"Error loading {league}/{season}: {e}")
            return pd.DataFrame()

    def load_all(self) -> pd.DataFrame:
        """
        Load all data for specified leagues and seasons.

        Returns:
            All successfully loaded seasons concatenated with a fresh index.
            If nothing could be loaded, an empty DataFrame is returned
            (pd.concat would otherwise raise on an empty list).
        """
        frames = []
        for league in self.leagues:
            for season in self.seasons:
                df = self.load_season(league, season)
                if df.empty:
                    continue
                frames.append(df)
                # Same fallback as load_season, so unknown league codes
                # print the code itself instead of "None".
                print(f" ✓ {self.LEAGUES.get(league, league)}, season {season}: "
                      f"{len(df)} matches")

        if frames:
            result = pd.concat(frames, ignore_index=True)
        else:
            result = pd.DataFrame()
        print(f"\nTotal loaded: {len(result)} matches")
        return result
# === Usage ===
# Configure a loader for five seasons of three leagues. The constructor only
# stores these lists; the CSVs are requested later by load_all().
loader = FootballDataLoader(
seasons=["2425", "2324", "2223", "2122", "2021"],
leagues=["E0", "SP1", "D1"] # EPL, La Liga, Bundesliga
)
raw_data = loader.load_all()

Cleaning and Transformation
class DataCleaner:
    """Data cleaning and standardization."""

    @staticmethod
    def clean(df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a cleaned, date-sorted copy of the raw match table.

        Steps: parse "Date" (day first), drop rows whose date cannot be
        parsed, sort by date, coerce the statistic/odds columns that are
        present to numeric, and add an integer "Result" column encoding
        "FTR" as H=2, D=1, A=0. Rows with any other "FTR" value are dropped.

        Args:
            df: Raw matches; must contain "Date" and "FTR" columns.

        Returns:
            A new DataFrame; the input is not modified.
        """
        df = df.copy()

        # Standardize date format.
        # format="mixed" parses every value on its own. Without it pandas 2.x
        # infers a single format from the first value, and errors="coerce"
        # then silently turns rows written in another format (e.g. dd/mm/yy
        # next to dd/mm/yyyy) into NaT, which the dropna below would discard.
        df["Date"] = pd.to_datetime(
            df["Date"], format="mixed", dayfirst=True, errors="coerce"
        )
        df = df.dropna(subset=["Date"])
        df = df.sort_values("Date").reset_index(drop=True)

        # Numeric columns (only those actually present are converted;
        # unparseable values become NaN).
        numeric_cols = [
            "FTHG", "FTAG", "HTHG", "HTAG",
            "HS", "AS", "HST", "AST",
            "HF", "AF", "HC", "AC",
            "HY", "AY", "HR", "AR",
            "B365H", "B365D", "B365A",
        ]
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Encoding result: H=2, D=1, A=0
        result_map = {"H": 2, "D": 1, "A": 0}
        df["Result"] = df["FTR"].map(result_map)
        # Unmapped results become NaN; drop them before the int cast.
        df = df.dropna(subset=["Result"])
        df["Result"] = df["Result"].astype(int)
        return df
# Clean the loaded matches and report how many rows remain afterwards.
clean_data = DataCleaner.clean(raw_data)
print(f"After cleaning: {len(clean_data)} matches")
print(f"Result distribution:\n{clean_data['FTR'].value_counts()}")

## Feature Engineering with Claude
This is the key stage where we create features that enable the model to "understand" the match context. Here, Claude serves as an intelligent assistant - helping generate feature ideas and evaluate contextual factors.
Statistical Features (Rolling Averages)
class FeatureEngineer:
    """
    Feature generation based on historical team statistics.
    Key idea: for each match we use ONLY data
    available BEFORE the match starts.
    """

    def __init__(self, window: int = 5, min_periods: int = 3):
        """
        Args:
            window: Number of previous matches in each rolling average.
            min_periods: Minimum number of previous matches required before
                an average is produced. Clamped to ``window`` because
                pandas rejects min_periods > window.
        """
        self.window = window
        self.min_periods = min(min_periods, window)

    def _prior_rolling_mean(self, series: pd.Series) -> pd.Series:
        """Rolling mean over the previous matches, excluding the current one."""
        # shift(1) — to exclude the current match
        return (
            series
            .shift(1)
            .rolling(window=self.window, min_periods=self.min_periods)
            .mean()
        )

    def compute_team_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute rolling averages for each team
        over the last N matches.

        Every match contributes two rows (one per team) that keep the
        match's index label from ``df``.

        Returns:
            One row per (match, team) with the raw per-team values, "IsHome",
            an ``avg_<stat>`` column per statistic, "Points" (3/1/0) and
            "Form" (rolling mean of Points). Rows are grouped by team in
            order of first appearance, chronologically within each team.
        """
        df = df.sort_values("Date").copy()

        # Shared per-team column names. They are assigned to both frames
        # BEFORE "IsHome" is added: copying home_records.columns afterwards
        # would put 13 names on a 12-column frame and raise ValueError.
        record_cols = ["Date", "Team", "GF", "GA",
                       "Shots", "ShotsAgainst",
                       "SoT", "SoTAgainst",
                       "Corners", "CornersAgainst",
                       "Fouls", "FoulsAgainst"]

        # Create separate records for home and away teams
        home_records = df[["Date", "HomeTeam", "FTHG", "FTAG",
                           "HS", "AS", "HST", "AST",
                           "HC", "AC", "HF", "AF"]].copy()
        home_records.columns = record_cols
        home_records["IsHome"] = 1

        # Same layout from the away side: "for"/"against" columns swapped.
        away_records = df[["Date", "AwayTeam", "FTAG", "FTHG",
                           "AS", "HS", "AST", "HST",
                           "AC", "HC", "AF", "HF"]].copy()
        away_records.columns = record_cols
        away_records["IsHome"] = 0

        all_records = pd.concat([home_records, away_records])
        all_records = all_records.sort_values("Date")

        # Calculate rolling averages per team
        stats_cols = ["GF", "GA", "Shots", "ShotsAgainst",
                      "SoT", "SoTAgainst", "Corners",
                      "CornersAgainst", "Fouls", "FoulsAgainst"]

        per_team = []
        # sort=False keeps teams in order of first appearance and preserves
        # the chronological row order inside each group.
        for _, team_data in all_records.groupby("Team", sort=False):
            team_data = team_data.copy()
            for col in stats_cols:
                team_data[f"avg_{col}"] = self._prior_rolling_mean(team_data[col])

            # Form: average points over last N matches.
            # 3 for a win, 1 for a draw, 0 otherwise (missing goals -> 0).
            goals_for, goals_against = team_data["GF"], team_data["GA"]
            team_data["Points"] = (
                goals_for.gt(goals_against).astype(int) * 3
                + goals_for.eq(goals_against).astype(int)
            )
            team_data["Form"] = self._prior_rolling_mean(team_data["Points"])
            per_team.append(team_data)

        if not per_team:
            # No matches: return an empty frame with the usual columns
            # instead of letting pd.concat raise on an empty list.
            derived = [f"avg_{col}" for col in stats_cols] + ["Points", "Form"]
            return all_records.reindex(columns=[*all_records.columns, *derived])
        return pd.concat(per_team)

    def build_match_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Join home and away team statistics
        for each match.

        For every rolling statistic three columns are added:
        ``home_<stat>``, ``away_<stat>`` and ``diff_<stat>`` (home - away).

        Returns:
            ``df`` (original index preserved) plus the feature columns.
            Matches with any missing feature are dropped, i.e. matches where
            a team has fewer than ``min_periods`` previous games.
        """
        team_stats = self.compute_team_stats(df)
        stat_features = [c for c in team_stats.columns if c.startswith("avg_")]
        stat_features.append("Form")

        feature_cols = []
        for feat in stat_features:
            feature_cols += [f"home_{feat}", f"away_{feat}", f"diff_{feat}"]

        # Index the per-team rows once by (team, date, is_home) so each match
        # is an O(1) lookup instead of two scans over the whole stats table.
        # Rows with a missing team or date can never match and are left out;
        # setdefault keeps the first row when a key is duplicated.
        valid = team_stats.dropna(subset=["Team", "Date"])
        stat_rows = valid[stat_features].to_numpy(dtype=float)
        keys = zip(valid["Team"], valid["Date"], valid["IsHome"])
        lookup = {}
        for key, values in zip(keys, stat_rows):
            lookup.setdefault(key, values)

        features_list = []
        matches = zip(df.index, df["HomeTeam"], df["AwayTeam"], df["Date"])
        for idx, home, away, date in matches:
            home_vals = lookup.get((home, date, 1))
            away_vals = lookup.get((away, date, 0))
            if home_vals is None or away_vals is None:
                continue
            row = {"match_idx": idx}
            for feat, h_val, a_val in zip(stat_features, home_vals, away_vals):
                row[f"home_{feat}"] = h_val
                row[f"away_{feat}"] = a_val
                # Home-minus-away difference of the same statistic.
                row[f"diff_{feat}"] = h_val - a_val
            features_list.append(row)

        if not features_list:
            # Nothing to join (e.g. empty input): return an empty frame that
            # still carries the feature columns.
            return df.iloc[0:0].reindex(columns=[*df.columns, *feature_cols])

        features_df = pd.DataFrame(
            features_list, columns=["match_idx", *feature_cols]
        ).set_index("match_idx")
        result = df.join(features_df, how="inner")
        return result.dropna(subset=feature_cols)
# === Usage ===
# Build per-match features from the cleaned data using a 5-match window.
engineer = FeatureEngineer(window=5)
featured_data = engineer.build_match_features(clean_data)
print(f"Matches with features: {len(featured_data)}")
# Feature columns are the ones prefixed with home_, away_ or diff_.
print(f"Number of features: {len([c for c in featured_data.columns if c.startswith(('home_', 'away_', 'diff_'))])}")
Generated by Thread Navigator
Press ⌘ + S to quick-export
