Hi 👋, we have updated the app and fixed multiple bugs. We are short on funds, so we ask free users not to use an ad blocker. Ads are non-intrusive. 😊

✨ Visual Editor

close

Thread Truncated

Only the first 20 tweets are shown to ensure high-quality rendering and prevent image size issues.

palette Canvas & Background

Gradient:arrow_forward
Text Color:
135°

style Card Style

40px
16px

text_fields Typography

16px
zostaff
@zostaff
## Complete Technical Guide: From Data Collection to a Working Model
Thread image
zostaff
@zostaff
This article provides a step-by-step walkthrough of the architecture and implementation of an analytical system for predicting football match outcomes. The system uses Claude API from Anthropic as its "brain" - for data interpretation, feature engineering, and generating final predictions. The key innovation is combining three probability layers: bookmaker odds (Bet365), Polymarket prediction market data (blockchain-based crowd intelligence), and a custom ML model. The entire pipeline is written in Python using pandas, scikit-learn, XGBoost, and matplotlib.
zostaff
@zostaff
## System Architecture
zostaff
@zostaff
The system consists of several layers, each serving a specific role:
zostaff
@zostaff
┌─────────────────────────────────────────────────────────────┐
│ DATA LAYER │
│ football-data.co.uk │ API-Football │ FBref │ Kaggle │
│ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ 🔗 Polymarket Gamma API (prediction market) │ │
│ │ Crowd-sourced probabilities on the Polygon chain │ │
│ └──────────────────────────────────────────────────┘ │
└──────────────────────────┬───────────────────────────────────┘


┌─────────────────────────────────────────────────────────────┐
│ PROCESSING LAYER │
│ pandas │ numpy │ data cleaning │ feature engineering │
│ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ Claude API: feature generation, │ │
│ │ context analysis, statistics interpretation │ │
│ └──────────────────────────────────────────────────┘ │
│ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ Merging 3 probability layers: │ │
│ │ Bookmaker odds + Polymarket prices + ML model │ │
│ └──────────────────────────────────────────────────┘ │
└──────────────────────────┬───────────────────────────────────┘


┌─────────────────────────────────────────────────────────────┐
│ MODEL LAYER │
│ Logistic Regression │ Random Forest │ XGBoost │
│ Ensemble (Voting / Stacking) │
└──────────────────────────┬───────────────────────────────────┘


┌─────────────────────────────────────────────────────────────┐
│ INTERPRETATION LAYER │
│ Claude API: natural language prediction explanation │
│ + confidence assessment + divergence analysis │
│ between bookmaker / Polymarket / ML │
└──────────────────────────┬───────────────────────────────────┘


┌─────────────────────────────────────────────────────────────┐
│ OUTPUT LAYER │
│ matplotlib visualizations │ JSON reports │ Telegram bot │
└─────────────────────────────────────────────────────────────┘
zostaff
@zostaff
Required Dependencies
zostaff
@zostaff
# requirements.txt
anthropic>=0.40.0
pandas>=2.1.0
numpy>=1.24.0
scikit-learn>=1.3.0
xgboost>=2.0.0
matplotlib>=3.8.0
seaborn>=0.13.0
requests>=2.31.0
python-dotenv>=1.0.0
schedule>=1.2.0 # pipeline automation
zostaff
@zostaff
Installation:
zostaff
@zostaff
pip install anthropic pandas numpy scikit-learn xgboost matplotlib seaborn requests python-dotenv schedule
zostaff
@zostaff
The Polymarket Gamma API does not require a dedicated SDK - all requests are made via `requests` to public REST endpoints without authentication.
zostaff
@zostaff
## Data Collection and Preparation
zostaff
@zostaff
The primary data source is football-data.co.uk, which provides CSV files with match results and statistics for all major European leagues. The data includes goals, shots, corners, fouls, cards, and bookmaker odds.
zostaff
@zostaff
Data Loading
zostaff
@zostaff
import pandas as pd
import numpy as np
from pathlib import Path

class FootballDataLoader:
    """
    Historical football match data loader.

    Source: football-data.co.uk. Every league/season pair is read from a
    single CSV located at ``{BASE_URL}/{season}/{league}.csv``.
    """

    BASE_URL = "https://www.football-data.co.uk/mmz4281"

    # Division code used in the CSV file name -> human-readable league name.
    LEAGUES = {
        "E0": "Premier League",
        "SP1": "La Liga",
        "D1": "Bundesliga",
        "I1": "Serie A",
        "F1": "Ligue 1",
    }

    # Columns retained from the raw CSV. Columns a given file does not
    # contain are skipped silently (see ``load_season``).
    COLUMNS_TO_KEEP = [
        "Date", "HomeTeam", "AwayTeam",
        "FTHG", "FTAG", "FTR",        # Final score and result
        "HTHG", "HTAG", "HTR",        # Half-time score
        "HS", "AS",                   # Shots
        "HST", "AST",                 # Shots on target
        "HF", "AF",                   # Fouls
        "HC", "AC",                   # Corners
        "HY", "AY",                   # Yellow cards
        "HR", "AR",                   # Red cards
        "B365H", "B365D", "B365A",    # Bet365 odds
    ]

    def __init__(self, seasons: list[str], leagues: list[str] = None):
        """
        Args:
            seasons: Season codes exactly as they appear in the URL path,
                e.g. ``["2324", "2223", "2122"]``.
            leagues: Division codes to load. ``None`` (or an empty list)
                selects every key of ``LEAGUES``.
        """
        self.seasons = seasons
        self.leagues = leagues or list(self.LEAGUES.keys())

    def load_season(self, league: str, season: str) -> pd.DataFrame:
        """
        Load data for a single season and league.

        Rows without a home team, away team or full-time result are dropped,
        and ``League`` / ``Season`` columns are added.

        Returns:
            The loaded matches, or an empty DataFrame when anything goes
            wrong (the error is printed, not raised).
        """
        url = f"{self.BASE_URL}/{season}/{league}.csv"
        # Deliberately best-effort: one missing or malformed file must not
        # abort a multi-season download, so any failure yields an empty frame.
        try:
            df = pd.read_csv(url, encoding="utf-8", on_bad_lines="skip")
            available_cols = [c for c in self.COLUMNS_TO_KEEP if c in df.columns]
            df = df[available_cols].dropna(subset=["HomeTeam", "AwayTeam", "FTR"])
            df["League"] = self.LEAGUES.get(league, league)
            df["Season"] = season
            return df
        except Exception as e:
            print(f"Error loading {league}/{season}: {e}")
            return pd.DataFrame()

    def load_all(self) -> pd.DataFrame:
        """
        Load all data for the specified leagues and seasons.

        Returns:
            One DataFrame with every successfully loaded season stacked
            together under a fresh 0..n-1 index. When nothing could be
            loaded, an empty DataFrame carrying the expected column names
            is returned instead of raising.
        """
        frames = []
        for league in self.leagues:
            for season in self.seasons:
                df = self.load_season(league, season)
                if not df.empty:
                    frames.append(df)
                    # Fall back to the raw code for leagues missing from
                    # LEAGUES, mirroring what load_season stores.
                    print(f" ✓ {self.LEAGUES.get(league, league)}, season {season}: "
                          f"{len(df)} matches")

        if frames:
            result = pd.concat(frames, ignore_index=True)
        else:
            # pd.concat([]) raises ValueError; hand back an empty frame with
            # the expected schema so callers can still select columns.
            result = pd.DataFrame(
                columns=[*self.COLUMNS_TO_KEEP, "League", "Season"]
            )
        print(f"\nTotal loaded: {len(result)} matches")
        return result


# === Usage ===
# Five seasons of the EPL, La Liga and the Bundesliga, fetched in one pass.
loader = FootballDataLoader(
    leagues=["E0", "SP1", "D1"],
    seasons=["2425", "2324", "2223", "2122", "2021"],
)
raw_data = loader.load_all()
zostaff
@zostaff
Cleaning and Transformation
zostaff
@zostaff
class DataCleaner:
    """Data cleaning and standardization."""

    @staticmethod
    def clean(df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a cleaned, date-sorted copy of the raw match table.

        Steps:
            1. Parse ``Date`` (day-first) and drop rows whose date cannot
               be parsed.
            2. Sort by date and renumber the index 0..n-1.
            3. Coerce the score / statistics / odds columns that are present
               to numbers (unparseable cells become NaN).
            4. Add an integer ``Result`` column encoded H=2, D=1, A=0 and
               drop rows whose ``FTR`` is none of those.

        Args:
            df: Raw matches; must contain ``Date`` and ``FTR``. The input
                frame is not modified.

        Returns:
            The cleaned DataFrame.
        """
        df = df.copy()

        # Standardize date format.
        # format="mixed" parses every value on its own. Without it pandas 2.x
        # infers a single format from the first value and, combined with
        # errors="coerce", turns every differently formatted date (e.g.
        # dd/mm/yy next to dd/mm/yyyy) into NaT, which would then be
        # silently dropped below.
        df["Date"] = pd.to_datetime(
            df["Date"], format="mixed", dayfirst=True, errors="coerce"
        )
        df = df.dropna(subset=["Date"])
        df = df.sort_values("Date").reset_index(drop=True)

        # Numeric columns
        numeric_cols = [
            "FTHG", "FTAG", "HTHG", "HTAG",
            "HS", "AS", "HST", "AST",
            "HF", "AF", "HC", "AC",
            "HY", "AY", "HR", "AR",
            "B365H", "B365D", "B365A",
        ]
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Encoding result: H=2, D=1, A=0
        result_map = {"H": 2, "D": 1, "A": 0}
        df["Result"] = df["FTR"].map(result_map)
        # Unknown result codes map to NaN; drop them before the int cast.
        df = df.dropna(subset=["Result"])
        df["Result"] = df["Result"].astype(int)

        return df


# Clean the loaded matches; `clean_data` feeds the feature-engineering step.
clean_data = DataCleaner.clean(raw_data)
# Report how many matches survived cleaning and how the full-time results
# (the FTR column) are distributed.
print(f"After cleaning: {len(clean_data)} matches")
print(f"Result distribution:\n{clean_data['FTR'].value_counts()}")
zostaff
@zostaff
## Feature Engineering with Claude
zostaff
@zostaff
This is the key stage where we create features that enable the model to "understand" the match context. Here, Claude serves as an intelligent assistant - helping generate feature ideas and evaluate contextual factors.
zostaff
@zostaff
Statistical Features (Rolling Averages)
zostaff
@zostaff
class FeatureEngineer:
"""
Feature generation based on historical team statistics.
Key idea: for each match we use ONLY data
available BEFORE the match starts.
"""

def __init__(self, window: int = 5):
self.window = window

def compute_team_stats(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Compute rolling averages for each team
over the last N matches.
"""
df = df.sort_values("Date").copy()

# Create separate records for home and away teams
home_records = df[["Date", "HomeTeam", "FTHG", "FTAG",
"HS", "AS", "HST", "AST",
"HC", "AC", "HF", "AF"]].copy()
home_records.columns = ["Date", "Team", "GF", "GA",
"Shots", "ShotsAgainst",
"SoT", "SoTAgainst",
"Corners", "CornersAgainst",
"Fouls", "FoulsAgainst"]
home_records["IsHome"] = 1

away_records = df[["Date", "AwayTeam", "FTAG", "FTHG",
"AS", "HS", "AST", "HST",
"AC", "HC", "AF", "HF"]].copy()
away_records.columns = home_records.columns
away_records["IsHome"] = 0

all_records = pd.concat([home_records, away_records])
all_records = all_records.sort_values("Date")

# Calculate rolling averages per team
stats_cols = ["GF", "GA", "Shots", "ShotsAgainst",
"SoT", "SoTAgainst", "Corners",
"CornersAgainst", "Fouls", "FoulsAgainst"]

rolling_stats = {}
for team in all_records["Team"].unique():
team_data = all_records[all_records["Team"] == team].copy()
for col in stats_cols:
# shift(1) — to exclude the current match
team_data[f"avg_{col}"] = (
team_data[col]
.shift(1)
.rolling(window=self.window, min_periods=3)
.mean()
)
# Form: average points over last N matches
team_data["Points"] = team_data.apply(
lambda r: 3 if r["GF"] > r["GA"]
else (1 if r["GF"] == r["GA"] else 0),
axis=1,
)
team_data["Form"] = (
team_data["Points"]
.shift(1)
.rolling(window=self.window, min_periods=3)
.mean()
)
rolling_stats[team] = team_data

return pd.concat(rolling_stats.values())

def build_match_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Join home and away team statistics
for each match.
"""
team_stats = self.compute_team_stats(df)

stat_features = [c for c in team_stats.columns if c.startswith("avg_")]
stat_features.append("Form")

features_list = []

for idx, match in df.iterrows():
home = match["HomeTeam"]
away = match["AwayTeam"]
date = match["Date"]

home_stats = team_stats[
(team_stats["Team"] == home) &
(team_stats["Date"] == date) &
(team_stats["IsHome"] == 1)
]
away_stats = team_stats[
(team_stats["Team"] == away) &
(team_stats["Date"] == date) &
(team_stats["IsHome"] == 0)
]

if home_stats.empty or away_stats.empty:
continue

row = {"match_idx": idx}
for feat in stat_features:
h_val = home_stats[feat].values[0]
a_val = away_stats[feat].values[0]
row[f"home_{feat}"] = h_val
row[f"away_{feat}"] = a_val
# Difference — one of the strongest features
row[f"diff_{feat}"] = h_val - a_val

features_list.append(row)

features_df = pd.DataFrame(features_list).set_index("match_idx")
result = df.join(features_df, how="inner")
return result.dropna(subset=[c for c in features_df.columns])


# === Usage ===
# Build pre-match features from the cleaned matches with a 5-match window.
engineer = FeatureEngineer(window=5)
featured_data = engineer.build_match_features(clean_data)
print(f"Matches with features: {len(featured_data)}")
# Feature columns are recognised by their home_/away_/diff_ name prefix.
print(f"Number of features: {len([c for c in featured_data.columns if c.startswith(('home_', 'away_', 'diff_'))])}")
Generated by Thread Navigator
100%
view_carousel Carousel Studio NEW
Press + S to quick-export