"""Data preparation script.

Merges code generation, tool-calling, and agentic datasets into a unified
ChatML format for SFT training.

Datasets used (all verified on HF Hub):
    1. Team-ACE/ToolACE — 26K high-quality tool-calling examples (ShareGPT format)
    2. Salesforce/APIGen-MT-5k — 5K multi-turn agentic tool-use trajectories
    3. ise-uiuc/Magicoder-OSS-Instruct-75K — 75K code generation (Python-focused)
    4. xingyaoww/code-act — 7K code-as-action + 69K general conversation

Output:
    A single dataset with a "messages" column in ChatML format, pushed to the
    Hub for training.

Usage:
    # Full run (pushes to Hub)
    python prepare_data.py --output_repo your-username/code-toolcall-sft-data

    # Test with small sample
    python prepare_data.py --max_per_source 100 --dry_run
"""

import argparse
import json
import os

from datasets import Dataset, load_dataset

# Read once at import time; None lets push_to_hub fall back to its own
# credential lookup.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Maps source-specific speaker labels onto ChatML roles. Labels that are not
# listed here are passed through unchanged.
# NOTE(review): "function_call"/"observation" follow the ShareGPT tool-calling
# convention (assistant tool invocation / tool result) — confirm against the
# APIGen-MT dataset card.
_ROLE_MAP = {
    "human": "user",
    "user": "user",
    "assistant": "assistant",
    "gpt": "assistant",
    "system": "system",
    "tool": "tool",
    "function_call": "assistant",
    "observation": "tool",
}


def _limit(ds, max_samples):
    """Return the first ``max_samples`` rows of ``ds``.

    A falsy ``max_samples`` (``None`` or ``0``) means "no limit" and returns
    ``ds`` unchanged.
    """
    if not max_samples:
        return ds
    return ds.select(range(min(max_samples, len(ds))))


def _first_present(msg, keys, default):
    """Return the first non-None value among ``keys`` in ``msg``, else ``default``."""
    for key in keys:
        value = msg.get(key)
        if value is not None:
            return value
    return default


def _to_chat_messages(conversations):
    """Convert a list of turns into ``{"role", "content"}`` dicts.

    Each turn may use either OpenAI-style keys (``role``/``content``) or
    ShareGPT-style keys (``from``/``value``); the former wins when both are
    present. A missing role defaults to ``"user"`` and missing content to
    ``""``.
    """
    messages = []
    for msg in conversations or []:
        raw_role = _first_present(msg, ("role", "from"), "user")
        content = _first_present(msg, ("content", "value"), "")
        messages.append(
            {"role": _ROLE_MAP.get(raw_role, raw_role), "content": content}
        )
    return messages


def _normalize_message(msg):
    """Flatten one message into a plain ``{"role": str, "content": str}`` dict.

    Any ``tool_calls`` are serialized as JSON and appended to the content, and
    tool messages carrying a ``name`` get a ``[Tool: <name>]`` header.
    """
    role = msg.get("role") or "user"
    content = msg.get("content") or ""
    # Coerce up front so the string concatenations below cannot raise on
    # structured (non-str) content.
    if not isinstance(content, str):
        content = str(content)

    tool_calls = msg.get("tool_calls")
    if tool_calls:
        tool_calls_str = json.dumps(tool_calls, indent=2)
        # NOTE(review): the serialized calls are delimited by newlines only;
        # confirm no wrapper tag was intended here.
        if content:
            content += f"\n\n\n{tool_calls_str}\n"
        else:
            content += f"\n{tool_calls_str}\n"

    name = msg.get("name")
    if role == "tool" and name:
        content = f"[Tool: {name}]\n{content}"

    return {"role": str(role), "content": content}


def _normalize_rows(ds):
    """Return ``ds`` as a list of rows with uniformly typed messages."""
    return [
        {
            "messages": [_normalize_message(msg) for msg in example["messages"]],
            "source": example.get("source", "unknown"),
        }
        for example in ds
    ]


def convert_toolace(max_samples=None):
    """Load Team-ACE/ToolACE and convert it to ``messages``/``source`` rows.

    Args:
        max_samples: Optional cap on the number of rows taken from the start
            of the train split; ``None`` or ``0`` keeps everything.

    Returns:
        A dataset whose only columns are ``messages`` and ``source``.
    """
    print("Loading Team-ACE/ToolACE...")
    ds = load_dataset("Team-ACE/ToolACE", split="train")
    ds = _limit(ds, max_samples)

    def convert(example):
        messages = []
        if example.get("system"):
            messages.append({"role": "system", "content": example["system"]})
        messages.extend(_to_chat_messages(example["conversations"]))
        return {"messages": messages, "source": "toolace"}

    ds = ds.map(convert, remove_columns=ds.column_names)
    print(f" ToolACE: {len(ds)} examples")
    return ds


def convert_apigen_mt(max_samples=None):
    """Load Salesforce/APIGen-MT-5k and convert it to ``messages``/``source`` rows.

    The row's ``tools`` value, when present, is appended to the system prompt
    under an "Available tools:" heading.

    Args:
        max_samples: Optional cap on the number of rows taken from the start
            of the train split; ``None`` or ``0`` keeps everything.

    Returns:
        A dataset whose only columns are ``messages`` and ``source``.
    """
    print("Loading Salesforce/APIGen-MT-5k...")
    ds = load_dataset("Salesforce/APIGen-MT-5k", "dataset", split="train")
    ds = _limit(ds, max_samples)

    def convert(example):
        # `or ""` guards against a present-but-null system column, which
        # would otherwise break the `+=` below.
        system_content = example.get("system") or ""
        tools = example.get("tools")
        if tools:
            if isinstance(tools, list):
                tools_str = json.dumps(tools, indent=2)
            else:
                tools_str = str(tools)
            system_content += f"\n\nAvailable tools:\n{tools_str}"

        messages = []
        if system_content:
            messages.append({"role": "system", "content": system_content})
        messages.extend(_to_chat_messages(example["conversations"]))
        return {"messages": messages, "source": "apigen_mt"}

    ds = ds.map(convert, remove_columns=ds.column_names)
    print(f" APIGen-MT: {len(ds)} examples")
    return ds


def convert_magicoder(max_samples=None):
    """Load Magicoder-OSS-Instruct-75K, keep Python rows, and convert them.

    Filtering on ``lang`` happens before the ``max_samples`` cap, so the cap
    counts Python examples only.

    Args:
        max_samples: Optional cap on the number of Python rows kept;
            ``None`` or ``0`` keeps everything.

    Returns:
        A dataset whose only columns are ``messages`` and ``source``.
    """
    print("Loading ise-uiuc/Magicoder-OSS-Instruct-75K...")
    ds = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")
    # `or ""` tolerates rows whose lang is null rather than missing.
    ds = ds.filter(lambda x: (x.get("lang") or "").lower() == "python")
    ds = _limit(ds, max_samples)

    system_prompt = (
        "You are an expert Python programmer. "
        "Write clean, well-documented code with proper error handling."
    )

    def convert(example):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": example["problem"]},
            {"role": "assistant", "content": example["solution"]},
        ]
        return {"messages": messages, "source": "magicoder"}

    ds = ds.map(convert, remove_columns=ds.column_names)
    print(f" Magicoder (Python): {len(ds)} examples")
    return ds


def convert_codeact(max_samples=None):
    """Load the ``codeact`` split of xingyaoww/code-act and convert it.

    Args:
        max_samples: Optional cap on the number of rows taken from the start
            of the split; ``None`` or ``0`` keeps everything.

    Returns:
        A dataset whose only columns are ``messages`` and ``source``.
    """
    print("Loading xingyaoww/code-act (codeact split)...")
    ds = load_dataset("xingyaoww/code-act", split="codeact")
    ds = _limit(ds, max_samples)

    def convert(example):
        return {
            "messages": _to_chat_messages(example["conversations"]),
            "source": "codeact",
        }

    ds = ds.map(convert, remove_columns=ds.column_names)
    print(f" CodeAct: {len(ds)} examples")
    return ds


def main():
    """Convert every source, merge and shuffle them, then push unless --dry_run."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output_repo", type=str, default="your-username/code-toolcall-sft-data"
    )
    parser.add_argument("--max_per_source", type=int, default=None)
    parser.add_argument("--dry_run", action="store_true")
    args = parser.parse_args()

    max_s = args.max_per_source
    all_datasets = [
        convert_toolace(max_samples=max_s),
        convert_apigen_mt(max_samples=max_s),
        convert_magicoder(max_samples=max_s),
        convert_codeact(max_samples=max_s),
    ]

    # Rebuild from plain Python rows so every source ends up with the same
    # string-typed message fields before merging.
    all_rows = []
    for ds in all_datasets:
        all_rows.extend(_normalize_rows(ds))

    merged = Dataset.from_list(all_rows)
    merged = merged.shuffle(seed=42)  # fixed seed keeps the row order reproducible
    print(f"\nTotal: {len(merged)} examples")

    if args.dry_run:
        print("[DRY RUN]")
    else:
        merged.push_to_hub(args.output_repo, token=HF_TOKEN)
        print(f"Pushed to {args.output_repo}")


if __name__ == "__main__":
    main()