| """ |
| Data Preparation Script: Merges code generation, tool-calling, and agentic datasets |
| into a unified ChatML format for SFT training. |
| |
| Datasets used (all verified on HF Hub): |
| 1. Team-ACE/ToolACE — 26K high-quality tool-calling examples (ShareGPT format) |
| 2. Salesforce/APIGen-MT-5k — 5K multi-turn agentic tool-use trajectories |
| 3. ise-uiuc/Magicoder-OSS-Instruct-75K — 75K code generation (Python-focused) |
| 4. xingyaoww/code-act — 7K code-as-action + 69K general conversation |
| |
| Output: A single dataset with "messages" column in ChatML format, |
| pushed to the Hub for training. |
| |
| Usage: |
| # Full run (pushes to Hub) |
| python prepare_data.py --output_repo your-username/code-toolcall-sft-data |
| |
| # Test with small sample |
| python prepare_data.py --max_per_source 100 --dry_run |
| """ |
|
|
| import json |
| import os |
| from datasets import load_dataset, Dataset |
|
|
# Hugging Face write token read from the environment; may be None when unset
# (presumably push_to_hub then falls back to cached credentials — confirm).
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
def convert_toolace(max_samples=None):
    """Convert Team-ACE/ToolACE (ShareGPT format) into ChatML messages.

    Args:
        max_samples: optional cap on the number of rows taken from the split.

    Returns:
        A Dataset with "messages" (list of role/content dicts) and "source".
    """
    print("Loading Team-ACE/ToolACE...")
    ds = load_dataset("Team-ACE/ToolACE", split="train")
    if max_samples:
        ds = ds.select(range(min(max_samples, len(ds))))

    # ShareGPT speaker tags -> ChatML roles; unknown tags pass through as-is.
    role_map = {"human": "user", "user": "user", "assistant": "assistant",
                "gpt": "assistant", "system": "system", "tool": "tool"}

    def to_chatml(example):
        chat = []
        # A non-empty per-example system prompt becomes the leading message.
        if example.get("system"):
            chat.append({"role": "system", "content": example["system"]})
        for turn in example["conversations"]:
            tag = turn.get("from", "")
            chat.append({
                "role": role_map.get(tag, turn.get("from", "user")),
                "content": turn.get("value", ""),
            })
        return {"messages": chat, "source": "toolace"}

    ds = ds.map(to_chatml, remove_columns=ds.column_names)
    print(f" ToolACE: {len(ds)} examples")
    return ds
|
|
|
|
def convert_apigen_mt(max_samples=None):
    """Convert Salesforce/APIGen-MT-5k multi-turn tool-use data into ChatML.

    Tool schemas are flattened into the system prompt so they survive the
    conversion to a plain "messages" list.

    Args:
        max_samples: optional cap on the number of rows taken from the split.

    Returns:
        A Dataset with "messages" and "source" columns.
    """
    print("Loading Salesforce/APIGen-MT-5k...")
    ds = load_dataset("Salesforce/APIGen-MT-5k", "dataset", split="train")
    if max_samples:
        ds = ds.select(range(min(max_samples, len(ds))))

    # ShareGPT speaker tags -> ChatML roles; unknown tags pass through as-is.
    role_map = {"human": "user", "user": "user", "assistant": "assistant",
                "gpt": "assistant", "system": "system", "tool": "tool"}

    def convert(example):
        messages = []
        # BUG FIX: .get("system", "") still returns None when the column
        # exists with a null value, and `None += str` raises TypeError.
        # `or ""` normalizes both a missing key and a null value.
        system_content = example.get("system") or ""
        tools = example.get("tools") or []
        if tools:
            # Serialize the tool schemas; fall back to str() for non-list payloads.
            tools_str = json.dumps(tools, indent=2) if isinstance(tools, list) else str(tools)
            system_content += f"\n\nAvailable tools:\n{tools_str}"
        if system_content:
            messages.append({"role": "system", "content": system_content})
        for msg in example["conversations"]:
            role = role_map.get(msg.get("from", ""), msg.get("from", "user"))
            content = msg.get("value", "")
            messages.append({"role": role, "content": content})
        return {"messages": messages, "source": "apigen_mt"}

    ds = ds.map(convert, remove_columns=ds.column_names)
    print(f" APIGen-MT: {len(ds)} examples")
    return ds
|
|
|
|
def convert_magicoder(max_samples=None):
    """Convert ise-uiuc/Magicoder-OSS-Instruct-75K (Python rows only) to ChatML.

    Args:
        max_samples: optional cap applied after the Python-language filter.

    Returns:
        A Dataset with "messages" and "source" columns.
    """
    print("Loading ise-uiuc/Magicoder-OSS-Instruct-75K...")
    ds = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")
    # Keep only Python examples (case-insensitive match on the lang column).
    ds = ds.filter(lambda x: x.get("lang", "").lower() == "python")
    if max_samples:
        ds = ds.select(range(min(max_samples, len(ds))))

    def to_chatml(row):
        return {
            "messages": [
                {"role": "system", "content": "You are an expert Python programmer. Write clean, well-documented code with proper error handling."},
                {"role": "user", "content": row["problem"]},
                {"role": "assistant", "content": row["solution"]},
            ],
            "source": "magicoder",
        }

    ds = ds.map(to_chatml, remove_columns=ds.column_names)
    print(f" Magicoder (Python): {len(ds)} examples")
    return ds
|
|
|
|
def convert_codeact(max_samples=None):
    """Convert xingyaoww/code-act (codeact split) into ChatML messages.

    Handles both role/content and ShareGPT from/value message layouts.

    Args:
        max_samples: optional cap on the number of rows taken from the split.

    Returns:
        A Dataset with "messages" and "source" columns.
    """
    print("Loading xingyaoww/code-act (codeact split)...")
    ds = load_dataset("xingyaoww/code-act", split="codeact")
    if max_samples:
        ds = ds.select(range(min(max_samples, len(ds))))

    # ShareGPT speaker tags -> ChatML roles; unknown tags pass through as-is.
    role_map = {"human": "user", "user": "user", "assistant": "assistant",
                "gpt": "assistant", "system": "system", "tool": "tool"}

    def to_chatml(example):
        chat = []
        for turn in example["conversations"]:
            # Prefer the ChatML-style keys, fall back to ShareGPT-style ones.
            raw_role = turn.get("role", turn.get("from", "user"))
            chat.append({
                "role": role_map.get(raw_role, raw_role),
                "content": turn.get("content", turn.get("value", "")),
            })
        return {"messages": chat, "source": "codeact"}

    ds = ds.map(to_chatml, remove_columns=ds.column_names)
    print(f" CodeAct: {len(ds)} examples")
    return ds
|
|
|
|
def main():
    """Load all four sources, normalize them into ChatML, merge, shuffle,
    and either push the result to the Hub or stop after a dry-run report."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--output_repo", type=str, default="your-username/code-toolcall-sft-data")
    parser.add_argument("--max_per_source", type=int, default=None)
    parser.add_argument("--dry_run", action="store_true")
    args = parser.parse_args()

    limit = args.max_per_source
    # Load each source in a fixed order so the pre-shuffle layout is stable.
    sources = [
        convert_toolace(max_samples=limit),
        convert_apigen_mt(max_samples=limit),
        convert_magicoder(max_samples=limit),
        convert_codeact(max_samples=limit),
    ]

    def normalize_and_extract(ds):
        """Coerce every message into plain {"role": str, "content": str} dicts,
        folding tool_calls and tool names into the content text."""
        rows = []
        for record in ds:
            normalized = []
            for turn in record["messages"]:
                role = turn.get("role", "user") or "user"
                content = turn.get("content", "") or ""
                calls = turn.get("tool_calls", None)
                if calls:
                    rendered = json.dumps(calls, indent=2)
                    tagged = f"<tool_calls>\n{rendered}\n</tool_calls>"
                    if content:
                        content += f"\n\n{tagged}"
                    else:
                        content += tagged
                tool_name = turn.get("name", None)
                if role == "tool" and tool_name:
                    content = f"[Tool: {tool_name}]\n{content}"
                normalized.append({"role": str(role), "content": str(content)})
            rows.append({"messages": normalized,
                         "source": record.get("source", "unknown")})
        return rows

    merged_rows = []
    for ds in sources:
        merged_rows.extend(normalize_and_extract(ds))
    merged = Dataset.from_list(merged_rows).shuffle(seed=42)

    print(f"\nTotal: {len(merged)} examples")
    if args.dry_run:
        print("[DRY RUN]")
    else:
        merged.push_to_hub(args.output_repo, token=HF_TOKEN)
        print(f"Pushed to {args.output_repo}")
|
|
|
|
# Script entry point: run the full merge pipeline when executed directly.
if __name__ == "__main__":
    main()
|
|