{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Wrangle Wiki Toxic\n",
    "\n",
    "Downloads one split of the `OxAISH-AL-LLM/wiki_toxic` dataset from the Hugging Face parquet API, ",
    "converts the `label` column into a boolean `toxic` column, renames `comment_text` to `text`, ",
    "and writes the result to `wiki_toxic-<split>.jsonl` as JSON Lines."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Dataset split to download; it is also embedded in the output file name.\n",
    "split = \"test\"\n",
    "\n",
    "# Reads the `0.parquet` file of the chosen split straight from the URL.\n",
    "df_raw = pd.read_parquet(\n",
    "    f\"https://huggingface.co/api/datasets/OxAISH-AL-LLM/wiki_toxic/parquet/default/{split}/0.parquet\"\n",
    ")\n",
    "df_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive the cleaned frame from `df_raw` instead of mutating it in place, so\n",
    "# this cell can be re-run without a KeyError on an already-dropped \"label\".\n",
    "df = (\n",
    "    df_raw.assign(toxic=lambda frame: frame[\"label\"].astype(bool))\n",
    "    .drop(columns=[\"label\"])\n",
    "    .rename(columns={\"comment_text\": \"text\"})\n",
    ")\n",
    "\n",
    "# `rename` silently ignores missing columns, so check the schema explicitly.\n",
    "assert {\"text\", \"toxic\"} <= set(df.columns), \"missing expected columns\"\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one JSON object per row (JSON Lines) into the working directory.\n",
    "df.to_json(f\"wiki_toxic-{split}.jsonl\", orient=\"records\", lines=True)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}