# code-llm-toolkit / prepare_data.py
# Uploaded by rndubs to the Hugging Face Hub (commit 8921f86, verified).
"""
Data Preparation Script: Merges code generation, tool-calling, and agentic datasets
into a unified ChatML format for SFT training.
Datasets used (all verified on HF Hub):
1. Team-ACE/ToolACE — 26K high-quality tool-calling examples (ShareGPT format)
2. Salesforce/APIGen-MT-5k — 5K multi-turn agentic tool-use trajectories
3. ise-uiuc/Magicoder-OSS-Instruct-75K — 75K code generation (Python-focused)
4. xingyaoww/code-act — 7K code-as-action + 69K general conversation
Output: A single dataset with "messages" column in ChatML format,
pushed to the Hub for training.
Usage:
# Full run (pushes to Hub)
python prepare_data.py --output_repo your-username/code-toolcall-sft-data
# Test with small sample
python prepare_data.py --max_per_source 100 --dry_run
"""
import json
import os
from datasets import load_dataset, Dataset
HF_TOKEN = os.environ.get("HF_TOKEN")
def convert_toolace(max_samples=None):
print("Loading Team-ACE/ToolACE...")
ds = load_dataset("Team-ACE/ToolACE", split="train")
if max_samples:
ds = ds.select(range(min(max_samples, len(ds))))
role_map = {"human": "user", "user": "user", "assistant": "assistant",
"gpt": "assistant", "system": "system", "tool": "tool"}
def convert(example):
messages = []
if example.get("system"):
messages.append({"role": "system", "content": example["system"]})
for msg in example["conversations"]:
role = role_map.get(msg.get("from", ""), msg.get("from", "user"))
content = msg.get("value", "")
messages.append({"role": role, "content": content})
return {"messages": messages, "source": "toolace"}
ds = ds.map(convert, remove_columns=ds.column_names)
print(f" ToolACE: {len(ds)} examples")
return ds
def convert_apigen_mt(max_samples=None):
print("Loading Salesforce/APIGen-MT-5k...")
ds = load_dataset("Salesforce/APIGen-MT-5k", "dataset", split="train")
if max_samples:
ds = ds.select(range(min(max_samples, len(ds))))
role_map = {"human": "user", "user": "user", "assistant": "assistant",
"gpt": "assistant", "system": "system", "tool": "tool"}
def convert(example):
messages = []
system_content = example.get("system", "")
tools = example.get("tools", [])
if tools:
tools_str = json.dumps(tools, indent=2) if isinstance(tools, list) else str(tools)
system_content += f"\n\nAvailable tools:\n{tools_str}"
if system_content:
messages.append({"role": "system", "content": system_content})
for msg in example["conversations"]:
role = role_map.get(msg.get("from", ""), msg.get("from", "user"))
content = msg.get("value", "")
messages.append({"role": role, "content": content})
return {"messages": messages, "source": "apigen_mt"}
ds = ds.map(convert, remove_columns=ds.column_names)
print(f" APIGen-MT: {len(ds)} examples")
return ds
def convert_magicoder(max_samples=None):
print("Loading ise-uiuc/Magicoder-OSS-Instruct-75K...")
ds = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")
ds = ds.filter(lambda x: x.get("lang", "").lower() == "python")
if max_samples:
ds = ds.select(range(min(max_samples, len(ds))))
def convert(example):
messages = [
{"role": "system", "content": "You are an expert Python programmer. Write clean, well-documented code with proper error handling."},
{"role": "user", "content": example["problem"]},
{"role": "assistant", "content": example["solution"]},
]
return {"messages": messages, "source": "magicoder"}
ds = ds.map(convert, remove_columns=ds.column_names)
print(f" Magicoder (Python): {len(ds)} examples")
return ds
def convert_codeact(max_samples=None):
print("Loading xingyaoww/code-act (codeact split)...")
ds = load_dataset("xingyaoww/code-act", split="codeact")
if max_samples:
ds = ds.select(range(min(max_samples, len(ds))))
role_map = {"human": "user", "user": "user", "assistant": "assistant",
"gpt": "assistant", "system": "system", "tool": "tool"}
def convert(example):
messages = []
for msg in example["conversations"]:
role = msg.get("role", msg.get("from", "user"))
role = role_map.get(role, role)
content = msg.get("content", msg.get("value", ""))
messages.append({"role": role, "content": content})
return {"messages": messages, "source": "codeact"}
ds = ds.map(convert, remove_columns=ds.column_names)
print(f" CodeAct: {len(ds)} examples")
return ds
def main():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--output_repo", type=str, default="your-username/code-toolcall-sft-data")
parser.add_argument("--max_per_source", type=int, default=None)
parser.add_argument("--dry_run", action="store_true")
args = parser.parse_args()
max_s = args.max_per_source
toolace = convert_toolace(max_samples=max_s)
apigen = convert_apigen_mt(max_samples=max_s)
magicoder = convert_magicoder(max_samples=max_s)
codeact = convert_codeact(max_samples=max_s)
all_datasets = [toolace, apigen, magicoder, codeact]
def normalize_and_extract(ds):
rows = []
for example in ds:
normalized = []
for msg in example["messages"]:
role = msg.get("role", "user") or "user"
content = msg.get("content", "") or ""
tool_calls = msg.get("tool_calls", None)
if tool_calls:
tool_calls_str = json.dumps(tool_calls, indent=2)
content += f"\n\n<tool_calls>\n{tool_calls_str}\n</tool_calls>" if content else f"<tool_calls>\n{tool_calls_str}\n</tool_calls>"
name = msg.get("name", None)
if role == "tool" and name:
content = f"[Tool: {name}]\n{content}"
normalized.append({"role": str(role), "content": str(content)})
source = example.get("source", "unknown")
rows.append({"messages": normalized, "source": source})
return rows
all_rows = []
for ds in all_datasets:
all_rows.extend(normalize_and_extract(ds))
merged = Dataset.from_list(all_rows)
merged = merged.shuffle(seed=42)
print(f"\nTotal: {len(merged)} examples")
if not args.dry_run:
merged.push_to_hub(args.output_repo, token=HF_TOKEN)
print(f"Pushed to {args.output_repo}")
else:
print("[DRY RUN]")
if __name__ == "__main__":
main()