{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88f0398e59ceb34c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `import bs4` is provided by the `beautifulsoup4` distribution; the bare\n",
    "# `beautifulsoup` name is the legacy Beautiful Soup 3 package.\n",
    "%pip install -Uqqq beautifulsoup4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b18e91b44a57a3fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request\n",
    "from base64 import b64encode\n",
    "from io import StringIO\n",
    "from typing import Dict\n",
    "\n",
    "import bs4 as bs\n",
    "import openai\n",
    "import pandas as pd\n",
    "from faker import Faker\n",
    "from openinference.instrumentation import TraceConfig\n",
    "from openinference.instrumentation.openai import OpenAIInstrumentor\n",
    "\n",
    "import phoenix as px"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c13fedbcbef1956",
   "metadata": {},
   "outputs": [],
   "source": [
    "from phoenix.otel import register\n",
    "\n",
    "# Register a tracer provider pointed at the local endpoint on port 4317 and\n",
    "# tagged with the `vision-fixture` project name.\n",
    "tracer_provider = register(endpoint=\"http://127.0.0.1:4317\", project_name=\"vision-fixture\")\n",
    "# Set the base64 image length limit to 1e9; presumably this keeps inline image\n",
    "# payloads from being dropped or truncated in recorded spans -- TODO confirm.\n",
    "config = TraceConfig(base64_image_max_length=1_000_000_000)\n",
    "# Instrument the OpenAI library with the provider and trace config built above.\n",
    "OpenAIInstrumentor().instrument(tracer_provider=tracer_provider, config=config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9215c1d3c571713c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the caption-contest index page. The `with` block closes the HTTP\n",
    "# response deterministically instead of leaving it to the garbage collector.\n",
    "with urllib.request.urlopen(\"https://nextml.github.io/caption-contest-data/\") as response:\n",
    "    source = response.read()\n",
    "# Name the parser explicitly so the parse does not depend on which optional\n",
    "# parsers happen to be installed (and bs4 does not warn about guessing one).\n",
    "table = bs.BeautifulSoup(source, \"html.parser\").find_all(\"table\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15f91c5e5d5e6bc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the scraped table, keep the first, third and last columns, and order\n",
    "# the rows by vote count (highest first).\n",
    "contest = pd.read_html(StringIO(str(table)))[0].iloc[:, [0, 2, -1]]\n",
    "contest = contest.sort_values(\"Number of votes\", ascending=False)\n",
    "\n",
    "# The first column starts with the contest number; use it as an unnamed integer index.\n",
    "contest_ids = contest.iloc[:, 0].apply(lambda s: int(s.split()[0]))\n",
    "df = contest.set_index(contest_ids).rename_axis(None, axis=0).iloc[:, [1, 2]]\n",
    "df = df.rename(columns=dict(zip(df.columns, [\"caption\", \"votes\"])))\n",
    "\n",
    "print(len(df))\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "506282eb0d90983b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# No arguments are passed, so the client relies on the library's defaults for\n",
    "# credentials and endpoint (presumably the OPENAI_API_KEY environment variable -- confirm).\n",
    "client = openai.OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72cf51ebd72e84fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "def message(idx: int, caption: str) -> Dict[str, object]:\n",
    "    \"\"\"Build the user chat message asking why `caption` is funny for cartoon `idx`.\n",
    "\n",
    "    Args:\n",
    "        idx: Contest number; selects `cartoons/{idx}.jpg` on the caption-contest site.\n",
    "        caption: Caption text appended to the prompt.\n",
    "\n",
    "    Returns:\n",
    "        A dict with `role` set to `user` and `content` holding a text part\n",
    "        followed by a low-detail `image_url` part.\n",
    "    \"\"\"\n",
    "    url = f\"https://nextml.github.io/caption-contest-data/cartoons/{idx}.jpg\"\n",
    "    text = f\"Explain like I'm five. What's funny about this caption?\\n\\n{caption}\\n\"\n",
    "    return {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"text\", \"text\": text},\n",
    "            {\"type\": \"image_url\", \"image_url\": {\"url\": url, \"detail\": \"low\"}},\n",
    "        ],\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e65e05252cc5a675",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Request up to 25 successful completions; stop early once more than 3\n",
    "# failures occur in a row (the counter resets on every success).\n",
    "n, errors = 25, 0\n",
    "for idx, caption, _ in df.itertuples():\n",
    "    if n == 0 or errors > 3:\n",
    "        break\n",
    "    messages = [message(idx, caption)]\n",
    "    try:\n",
    "        client.chat.completions.create(model=\"gpt-4o-mini\", messages=messages, max_tokens=1000)\n",
    "    except Exception as exc:\n",
    "        # Best-effort: skip this cartoon but report why. Catching Exception rather\n",
    "        # than BaseException lets KeyboardInterrupt / SystemExit stop the cell.\n",
    "        errors += 1\n",
    "        print(f\"cartoon {idx}: {type(exc).__name__}: {exc}\")\n",
    "    else:\n",
    "        errors = 0\n",
    "        n -= 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9789799a7e5ff94",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the ChartMimic test parquet, keep the rows marked hard, and order them\n",
    "# by instruction length, longest first.\n",
    "chart_mimic = pd.read_parquet(\"hf://datasets/ChartMimic/ChartMimic/test.parquet\")\n",
    "is_hard = chart_mimic[\"Difficulty\"] == \"hard\"\n",
    "df = chart_mimic[is_hard].sort_values(\n",
    "    by=\"Instruction\", key=lambda col: col.map(len), ascending=False\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "156d457ec9b97253",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Send each selected chart, inlined as a base64 PNG data URL, with its instruction.\n",
    "# NOTE(review): `.iloc[1:25]` skips the first row and yields 24 requests -- confirm the skip is intended.\n",
    "for _, instruction, input_figure in (\n",
    "    df.loc[:, [\"Instruction\", \"InputFigurePreview\"]].iloc[1:25].itertuples()\n",
    "):\n",
    "    # These names avoid rebinding the builtin `bytes` and the notebook-level\n",
    "    # `message` function, which must stay callable if the caption loop is re-run.\n",
    "    image_bytes = input_figure[\"bytes\"]\n",
    "    encoded_string = b64encode(image_bytes).decode()\n",
    "    image_message = {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"text\", \"text\": instruction},\n",
    "            {\n",
    "                \"type\": \"image_url\",\n",
    "                \"image_url\": {\"url\": f\"data:image/png;base64,{encoded_string}\", \"detail\": \"low\"},\n",
    "            },\n",
    "        ],\n",
    "    }\n",
    "    client.chat.completions.create(\n",
    "        model=\"gpt-4o-mini\", messages=[image_message], max_tokens=1000\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee9cd888d968c194",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the trace dataset for the `vision-fixture` project, the same project name\n",
    "# passed to `register` earlier, with a timeout of 1000 (units not shown here).\n",
    "td = px.Client().get_trace_dataset(timeout=1000, project_name=\"vision-fixture\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbf96e075f00375b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give every span a fake start drawn from `date_time_between(\"-3d\")` and shift\n",
    "# its end so the original duration is preserved.\n",
    "spans = td.dataframe\n",
    "duration = spans[\"end_time\"] - spans[\"start_time\"]  # taken before the overwrite below\n",
    "\n",
    "fake = Faker()\n",
    "start_time = pd.Series(\n",
    "    [fake.date_time_between(\"-3d\") for _ in spans.index], index=spans.index\n",
    ")\n",
    "end_time = start_time + duration\n",
    "\n",
    "spans[\"start_time\"] = start_time\n",
    "spans[\"end_time\"] = end_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad88f0c25af44a4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call `save()` on the trace dataset, which now carries the rewritten start/end\n",
    "# times; where the result is written is not shown in this notebook.\n",
    "td.save()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}