diff --git a/docs/examples/cryptocurrency-quickstart.ipynb b/docs/examples/cryptocurrency-quickstart.ipynb index e54bf0e..4d4a39a 100644 --- a/docs/examples/cryptocurrency-quickstart.ipynb +++ b/docs/examples/cryptocurrency-quickstart.ipynb @@ -63,7 +63,7 @@ "outputs": [], "source": [ "files = os.listdir(path)\n", - "files = [path + \"/\" + x for x in files]" + "files = [path+'/'+x for x in files]" ] }, { @@ -198,18 +198,18 @@ "# Read all filez and set them up to the readable structure for timecopilot\n", "for file in files:\n", " temp_df = pd.read_csv(file)\n", - " temp_df = temp_df[[\"Symbol\", \"Date\", \"Close\"]]\n", - " temp_df.columns = [\"unique_id\", \"ds\", \"y\"]\n", - " big_df = pd.concat([big_df, temp_df])\n", + " temp_df = temp_df[['Symbol','Date','Close']]\n", + " temp_df.columns = ['unique_id','ds','y']\n", + " big_df = pd.concat([big_df,temp_df])\n", "\n", "big_df = big_df.reset_index(drop=True)\n", "big_df[\"ds\"] = pd.to_datetime(big_df[\"ds\"], dayfirst=True, errors=\"coerce\")\n", "\n", - "# This line will be kept for execution time sanity, feel free to remove it if you want to stress timing a little further.\n", + "# This line will be kept for execution time sanity, feel free to remove it if you want to stress timing a little further. 
\n", "# big_df = big_df[big_df.ds >= \"2021-01-01\"]\n", - "cryptos = [\"MIOTA\", \"XEM\", \"ETH\", \"LTC\", \"DOGE\", \"CRO\", \"USDC\", \"ADA\"]\n", - "big_df = big_df[big_df.unique_id.isin(cryptos)]\n", - "big_df = big_df.reset_index(drop=True)\n", + "cryptos=['MIOTA','XEM','ETH','LTC','DOGE','CRO','USDC','ADA']\n", + "big_df=big_df[big_df.unique_id.isin(cryptos)]\n", + "big_df=big_df.reset_index(drop=True)\n", "big_df" ] }, @@ -341,7 +341,6 @@ " df_out.loc[idx, col] = np.nan\n", " return df_out\n", "\n", - "\n", "df_missing = add_missing(big_df, col=\"y\", frac=0.03, seed=42)\n", "df_missing = df_missing.sample(frac=1, random_state=42).reset_index(drop=True)\n", "print(df_missing)" @@ -710,14 +709,12 @@ } ], "source": [ - "anomaly_summary_xlm = anomalies_df[\n", + "anomaly_summary_xlm=anomalies_df[\n", " # (anomalies_df.unique_id=='SOL') & \\\n", - " (\n", - " (anomalies_df[\"Chronos-anomaly\"] == True)\n", - " | (anomalies_df[\"SeasonalNaive-anomaly\"] == True)\n", - " | (anomalies_df[\"Theta-anomaly\"] == True)\n", - " )\n", - "].reset_index(drop=True)\n", + " ((anomalies_df['Chronos-anomaly']==True) | \\\n", + " (anomalies_df['SeasonalNaive-anomaly']==True) |\n", + " (anomalies_df['Theta-anomaly']==True)\n", + " )].reset_index(drop=True)\n", "anomaly_summary_xlm" ] }, @@ -957,14 +954,12 @@ } ], "source": [ - "anomaly_summary_xlm = anomalies_df[\n", - " (anomalies_df.unique_id == \"ADA\")\n", - " & (\n", - " (anomalies_df[\"Chronos-anomaly\"] == True)\n", - " | (anomalies_df[\"SeasonalNaive-anomaly\"] == True)\n", - " | (anomalies_df[\"Theta-anomaly\"] == True)\n", - " )\n", - "].reset_index(drop=True)\n", + "anomaly_summary_xlm=anomalies_df[\n", + " (anomalies_df.unique_id=='ADA') & \\\n", + " ((anomalies_df['Chronos-anomaly']==True) | \\\n", + " (anomalies_df['SeasonalNaive-anomaly']==True) |\n", + " (anomalies_df['Theta-anomaly']==True)\n", + " )].reset_index(drop=True)\n", "anomaly_summary_xlm" ] }, @@ -1204,14 +1199,12 @@ } ], "source": [ - 
"anomaly_summary_xlm = anomalies_df[\n", - " (anomalies_df.unique_id == \"ADA\")\n", - " & (\n", - " (anomalies_df[\"Chronos-anomaly\"] == True)\n", - " & (anomalies_df[\"SeasonalNaive-anomaly\"] == True)\n", - " # (anomalies_df['Theta-anomaly']==True)\n", - " )\n", - "].reset_index(drop=True)\n", + "anomaly_summary_xlm=anomalies_df[\n", + " (anomalies_df.unique_id=='ADA') & \\\n", + " ((anomalies_df['Chronos-anomaly']==True) & \\\n", + " (anomalies_df['SeasonalNaive-anomaly']==True) \\\n", + " # (anomalies_df['Theta-anomaly']==True)\n", + " )].reset_index(drop=True)\n", "anomaly_summary_xlm" ] }, @@ -1248,12 +1241,12 @@ "source": [ "tcf1 = TimeCopilotForecaster(\n", " models=[\n", - " AutoARIMA(),\n", + " AutoARIMA(), \n", " Chronos(repo_id=\"amazon/chronos-bolt-mini\"),\n", " Theta(),\n", - " AutoETS(),\n", - " Moirai(),\n", - " Prophet(),\n", + " AutoETS(), \n", + " Moirai(), \n", + " Prophet(), \n", " SeasonalNaive(),\n", " ]\n", ")" @@ -1266,7 +1259,7 @@ "metadata": {}, "outputs": [], "source": [ - "fcst_df = tcf1.forecast(df=big_df, h=30, level=[80, 90])" + "fcst_df = tcf1.forecast(df=big_df, h=30, level=[80,90])" ] }, { @@ -1310,9 +1303,9 @@ "metadata": {}, "outputs": [], "source": [ - "eth_fcst_normal = fcst_df[(fcst_df.unique_id == \"ETH\")][\n", - " [\"unique_id\", \"ds\", \"Chronos\", \"Chronos-lo-80\"]\n", - "].reset_index(drop=True)" + "eth_fcst_normal=fcst_df[(fcst_df.unique_id=='ETH')]\\\n", + " [['unique_id','ds','Chronos','Chronos-lo-80']]\\\n", + " .reset_index(drop=True)" ] }, { @@ -1352,9 +1345,9 @@ "metadata": {}, "outputs": [], "source": [ - "eth_fcst_missing = fcst_df[(fcst_df.unique_id == \"ETH\")][\n", - " [\"unique_id\", \"ds\", \"Chronos\", \"Chronos-lo-80\"]\n", - "].reset_index(drop=True)" + "eth_fcst_missing=fcst_df[(fcst_df.unique_id=='ETH')]\\\n", + " [['unique_id','ds','Chronos','Chronos-lo-80']]\\\n", + " .reset_index(drop=True)" ] }, { @@ -1522,9 +1515,9 @@ } ], "source": [ - "compare = eth_fcst_normal.merge(eth_fcst_missing, 
on=[\"ds\", \"unique_id\"])\n", -    "compare[\"dif\"] = abs(compare[\"Chronos_x\"] - compare[\"Chronos_y\"])\n", -    "print(compare[\"dif\"].sum())" +    "compare=eth_fcst_normal.merge(eth_fcst_missing,on=['ds','unique_id'])\n", +    "compare['dif']=abs(compare['Chronos_x']-compare['Chronos_y'])\n", +    "print(compare['dif'].sum())" ] }, { diff --git a/mkdocs.yml b/mkdocs.yml index a40294e..f2fb26c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,12 +22,11 @@ nav: - examples/agent-quickstart.ipynb - examples/llm-providers.ipynb - examples/aws-bedrock.ipynb - - examples/google-llms.ipynb - examples/forecaster-quickstart.ipynb - examples/anomaly-detection-forecaster-quickstart.ipynb - examples/ts-foundation-models-comparison-quickstart.ipynb - examples/gift-eval.ipynb - examples/chronos-family.ipynb - examples/finetuning.ipynb - examples/cryptocurrency-quickstart.ipynb - examples/sktime.ipynb @@ -47,7 +47,6 @@ nav: - api/models/ml.md - api/models/neural.md - api/models/ensembles.md - - api/models/adapters/adapters.md - api/models/utils/forecaster.md - api/gift-eval/gift-eval.md - Changelogs: @@ -76,7 +75,6 @@ nav: theme: name: "material" - custom_dir: docs/overrides logo: https://timecopilot.s3.amazonaws.com/public/logos/logo-white.svg favicon: https://timecopilot.s3.amazonaws.com/public/logos/favicon-white.svg palette: diff --git a/timecopilot/agent.py b/timecopilot/agent.py index f24e053..1018db3 100644 --- a/timecopilot/agent.py +++ b/timecopilot/agent.py @@ -314,14 +314,77 @@ def _transform_time_series_to_text(df: pd.DataFrame) -> str: return output +def _summarize_time_series_for_llm(df: pd.DataFrame, max_series: int = 50) -> str: + summaries = [] + + grouped = df.groupby("unique_id") + + for i, (uid, g) in enumerate(grouped): + if i >= max_series: + break + + y = g["y"] + + summaries.append( + { + "series": uid[:12], # shorten UUID + "n_points": int(len(g)), + "mean": round(float(y.mean()), 3), + "std": round(float(y.std()), 3),
+ "q25": round(float(y.quantile(0.25)), 3), + "median": round(float(y.median()), 3), + "q75": round(float(y.quantile(0.75)), 3), + "min": round(float(y.min()), 3), + "max": round(float(y.max()), 3), + } + ) + + dataset_summary = { + "n_series_total": int(df["unique_id"].nunique()), + "n_series_sampled": len(summaries), + "series_sample": summaries, + } + + return ( + "Summary statistics for a sample of the dataset time series. " + "The dataset contains many series; only a subset is shown for context. " + f"{dataset_summary}" + ) + + +# def _transform_features_to_text(features_df: pd.DataFrame) -> str: +# output = ( +# "these are the time series features in json format where the key is " +# "the identifier of the time series and the values is also a json of " +# "feature names and their values." +# f"{features_df.to_json(orient='index')}" +# ) +# return output + + def _transform_features_to_text(features_df: pd.DataFrame) -> str: - output = ( - "these are the time series features in json format where the key is " - "the identifier of the time series and the values is also a json of " - "feature names and their values." - f"{features_df.to_json(orient='index')}" + # Base on similarly named function, edited to involve the summary + + summaries = [] + + for uid, row in features_df.iterrows(): + summaries.append( + { + "series": uid, + "trend_strength": float(row.get("trend_strength", 0)), + "seasonal_strength": float(row.get("seasonal_strength", 0)), + "spikiness": float(row.get("spikiness", 0)), + "lumpiness": float(row.get("lumpiness", 0)), + "entropy": float(row.get("entropy", 0)), + "acf1": float(row.get("acf1", 0)), + } + ) + + return ( + "Key time series diagnostic features extracted from the dataset. " + "These features describe structural properties of each series. 
" + f"{summaries}" ) - return output def _transform_eval_to_text(eval_df: pd.DataFrame, models: list[str]) -> str: @@ -329,64 +392,136 @@ def _transform_eval_to_text(eval_df: pd.DataFrame, models: list[str]) -> str: return output +# def _transform_fcst_to_text(fcst_df: pd.DataFrame) -> str: +# df_agg = fcst_df.groupby("unique_id").agg(list) +# output = ( +# "these are the forecasted values in json format where the key is the " +# "identifier of the time series and the values is also a json of two " +# "elements: the first element is the date column and the second " +# "element is the value column." +# f"{df_agg.to_json(orient='index')}" +# ) +# return output + + def _transform_fcst_to_text(fcst_df: pd.DataFrame) -> str: - df_agg = fcst_df.groupby("unique_id").agg(list) - output = ( - "these are the forecasted values in json format where the key is the " - "identifier of the time series and the values is also a json of two " - "elements: the first element is the date column and the second " - "element is the value column." - f"{df_agg.to_json(orient='index')}" + summaries = [] + # Identically named function commented out for tokenization ease purposes. + + for uid, g in fcst_df.groupby("unique_id"): + value_cols = [c for c in g.columns if c not in ["unique_id", "ds"]] + + for col in value_cols: + y = g[col] + + summaries.append( + { + "series": uid, + "model": col, + "horizon": int(len(g)), + "start_forecast": str(g["ds"].min()), + "end_forecast": str(g["ds"].max()), + "mean_forecast": float(y.mean()), + "std_forecast": float(y.std()), + "min_forecast": float(y.min()), + "q25_forecast": float(y.quantile(0.25)), + "median_forecast": float(y.median()), + "q75_forecast": float(y.quantile(0.75)), + "max_forecast": float(y.max()), + } + ) + + return ( + "These are summary statistics of the generated forecasts. " + "Each entry corresponds to one forecast series and model. " + "The values describe the forecast distribution across the " + " horizon, not the raw predictions. 
" + f"{summaries}" ) - return output + + +# def _transform_anomalies_to_text(anomalies_df: pd.DataFrame) -> str: +# """Transform anomaly detection results to text for the agent.""" +# # Get anomaly columns +# anomaly_cols = [col for col in anomalies_df.columns if col.endswith("-anomaly")] + +# if not anomaly_cols: +# return "No anomaly detection results available." + +# # Count anomalies per series +# anomaly_summary = {} +# for unique_id in anomalies_df["unique_id"].unique(): +# series_data = anomalies_df[anomalies_df["unique_id"] == unique_id] +# series_summary = {} + +# for anomaly_col in anomaly_cols: +# if anomaly_col in series_data.columns: +# anomaly_count = series_data[anomaly_col].sum() +# total_points = len(series_data) +# anomaly_rate = ( +# (anomaly_count / total_points) * 100 if total_points > 0 else 0 +# ) + +# # Get timestamps of anomalies +# anomalies = series_data[series_data[anomaly_col]] +# anomaly_dates = ( +# anomalies["ds"].dt.strftime("%Y-%m-%d").tolist() +# if len(anomalies) > 0 +# else [] +# ) + +# series_summary[anomaly_col] = { +# "count": int(anomaly_count), +# "rate_percent": round(anomaly_rate, 2), +# "dates": anomaly_dates[:10], # Limit to first 10 +# "total_points": int(total_points), +# } + +# anomaly_summary[unique_id] = series_summary + +# output = ( +# "these are the anomaly detection results in json format where the key is the " +# "identifier of the time series and the values contain anomaly statistics " +# "including count, rate, and timestamps of detected anomalies. 
" +# f"{anomaly_summary}" +# ) +# return output def _transform_anomalies_to_text(anomalies_df: pd.DataFrame) -> str: - """Transform anomaly detection results to text for the agent.""" - # Get anomaly columns + # Creating equally named function that will summarise information regarding + # anomalies to reduce tokenization traffic towards LLM anomaly_cols = [col for col in anomalies_df.columns if col.endswith("-anomaly")] if not anomaly_cols: return "No anomaly detection results available." - # Count anomalies per series - anomaly_summary = {} - for unique_id in anomalies_df["unique_id"].unique(): - series_data = anomalies_df[anomalies_df["unique_id"] == unique_id] - series_summary = {} + summaries = [] - for anomaly_col in anomaly_cols: - if anomaly_col in series_data.columns: - anomaly_count = series_data[anomaly_col].sum() - total_points = len(series_data) - anomaly_rate = ( - (anomaly_count / total_points) * 100 if total_points > 0 else 0 - ) + for uid, g in anomalies_df.groupby("unique_id"): + total_points = int(len(g)) - # Get timestamps of anomalies - anomalies = series_data[series_data[anomaly_col]] - anomaly_dates = ( - anomalies["ds"].dt.strftime("%Y-%m-%d").tolist() - if len(anomalies) > 0 - else [] - ) + for anomaly_col in anomaly_cols: + anomaly_count = int(g[anomaly_col].sum()) + anomaly_rate = ( + (anomaly_count / total_points) * 100 if total_points > 0 else 0 + ) - series_summary[anomaly_col] = { - "count": int(anomaly_count), - "rate_percent": round(anomaly_rate, 2), - "dates": anomaly_dates[:10], # Limit to first 10 - "total_points": int(total_points), + summaries.append( + { + "series": uid, + "model": anomaly_col.replace("-anomaly", ""), + "total_points": total_points, + "anomaly_count": anomaly_count, + "anomaly_rate_pct": round(anomaly_rate, 3), } + ) - anomaly_summary[unique_id] = series_summary - - output = ( - "these are the anomaly detection results in json format where the key is the " - "identifier of the time series and the values contain 
anomaly statistics " - "including count, rate, and timestamps of detected anomalies. " - f"{anomaly_summary}" + return ( + "These are summary statistics of anomaly detection results. " + "Each entry reports how many anomalies were detected for a series and model. " + f"{summaries}" ) - return output def _is_sktime_forecaster(obj: object) -> bool: @@ -456,11 +591,12 @@ def __init__( if "SeasonalNaive" not in self.forecasters: self.forecasters["SeasonalNaive"] = SeasonalNaive() self.system_prompt = f""" - You're a forecasting expert. You will be given a time series - as a list of numbers and your task is to determine the best model for it. + You're a forecasting expert. You will be given summary statistics + describing one or more time series, and your task is to determine + the best forecasting model for them. You have access to the following tools: - 1. tsfeatures_tool: Calculates time series features to help + 1. tsfeatures_tool: Calculates time series features to help with model selection. 
Available features are: {", ".join(TSFEATURES.keys())} @@ -930,7 +1066,9 @@ async def add_experiment_info( async def add_time_series( ctx: RunContext[ExperimentDataset], ) -> str: - return _transform_time_series_to_text(ctx.deps.df) + # return _transform_time_series_to_text(ctx.deps.df) + # keeping things traceable regarding previously used function + return _summarize_time_series_for_llm(ctx.deps.df) @self.forecasting_agent.tool async def tsfeatures_tool( diff --git a/timecopilot/forecaster.py b/timecopilot/forecaster.py index a27ac5f..4c41884 100644 --- a/timecopilot/forecaster.py +++ b/timecopilot/forecaster.py @@ -96,11 +96,9 @@ def _call_models( res_df_model = fn(**known_kwargs, **kwargs) res_df_model = res_df_model.rename( columns={ - col: ( - col.replace(self.fallback_model.alias, model.alias) - if col.startswith(self.fallback_model.alias) - else col - ) + col: col.replace(self.fallback_model.alias, model.alias) + if col.startswith(self.fallback_model.alias) + else col for col in res_df_model.columns } )