{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "vision-fixture-intro",
   "metadata": {},
   "source": [
    "# Vision trace fixture\n",
    "\n",
    "Sends image-bearing chat requests to OpenAI while the OpenAI instrumentor exports traces to a local Phoenix endpoint, then pulls the `vision-fixture` project back as a trace dataset, randomizes span start times over the last three days, and saves it.\n",
    "\n",
    "Requires a Phoenix collector listening on `127.0.0.1:4317` and OpenAI credentials available to `openai.OpenAI()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88f0398e59ceb34c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The PyPI package that provides the `bs4` module is `beautifulsoup4`.\n",
    "%pip install -Uqqq beautifulsoup4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b18e91b44a57a3fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request\n",
    "from base64 import b64encode\n",
    "from io import StringIO\n",
    "from typing import Any, Dict\n",
    "\n",
    "import bs4 as bs\n",
    "import openai\n",
    "import pandas as pd\n",
    "from faker import Faker\n",
    "from openinference.instrumentation import TraceConfig\n",
    "from openinference.instrumentation.openai import OpenAIInstrumentor\n",
    "\n",
    "import phoenix as px\n",
    "from phoenix.otel import register"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "vision-fixture-setup",
   "metadata": {},
   "source": [
    "## Configuration and tracing setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c13fedbcbef1956",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shared settings. PROJECT_NAME must be identical for trace registration and export.\n",
    "PROJECT_NAME = \"vision-fixture\"\n",
    "MODEL = \"gpt-4o-mini\"\n",
    "MAX_TOKENS = 1000\n",
    "CAPTION_CONTEST_URL = \"https://nextml.github.io/caption-contest-data/\"\n",
    "\n",
    "tracer_provider = register(endpoint=\"http://127.0.0.1:4317\", project_name=PROJECT_NAME)\n",
    "# NOTE(review): presumably raised so large inline base64 images are not cut off in the\n",
    "# recorded spans -- confirm against the OpenInference TraceConfig documentation.\n",
    "config = TraceConfig(base64_image_max_length=1_000_000_000)\n",
    "OpenAIInstrumentor().instrument(tracer_provider=tracer_provider, config=config)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "vision-fixture-captions",
   "metadata": {},
   "source": [
    "## Caption-contest cartoons (images passed by URL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9215c1d3c571713c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close the HTTP response deterministically instead of leaking the connection.\n",
    "with urllib.request.urlopen(CAPTION_CONTEST_URL) as response:\n",
    "    source = response.read()\n",
    "table = bs.BeautifulSoup(source).find_all(\"table\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15f91c5e5d5e6bc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the first, third and last columns, most-voted captions first.\n",
    "contest_table = pd.read_html(StringIO(str(table)))[0].iloc[:, [0, 2, -1]]\n",
    "contest_sorted = contest_table.sort_values(\"Number of votes\", ascending=False)\n",
    "# The leading token of the first column is parsed as the integer contest number.\n",
    "contest_ids = contest_sorted.iloc[:, 0].apply(lambda s: int(s.split()[0]))\n",
    "captions_df = (\n",
    "    contest_sorted.set_index(contest_ids)\n",
    "    .rename_axis(None, axis=0)\n",
    "    .iloc[:, [1, 2]]\n",
    ")\n",
    "captions_df = captions_df.rename(\n",
    "    columns=dict(zip(captions_df.columns, [\"caption\", \"votes\"]))\n",
    ")\n",
    "print(len(captions_df))\n",
    "captions_df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "506282eb0d90983b",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = openai.OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72cf51ebd72e84fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "def message(idx: int, caption: str) -> Dict[str, Any]:\n",
    "    \"\"\"Build a user chat message pairing a caption with its contest cartoon.\n",
    "\n",
    "    Args:\n",
    "        idx: Contest number, used to build the cartoon image URL.\n",
    "        caption: Caption text to include in the prompt.\n",
    "\n",
    "    Returns:\n",
    "        A chat message dict whose ``content`` holds a text part and a\n",
    "        low-detail ``image_url`` part.\n",
    "    \"\"\"\n",
    "    url = f\"{CAPTION_CONTEST_URL}cartoons/{idx}.jpg\"\n",
    "    text = f\"Explain like I'm five. What's funny about this caption?\\n\\n{caption}\\n\"\n",
    "    return {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"text\", \"text\": text},\n",
    "            {\"type\": \"image_url\", \"image_url\": {\"url\": url, \"detail\": \"low\"}},\n",
    "        ],\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e65e05252cc5a675",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop after 25 successful requests, or once more than 3 requests fail in a row.\n",
    "remaining, consecutive_errors = 25, 0\n",
    "for idx, caption, _ in captions_df.itertuples():\n",
    "    if remaining == 0 or consecutive_errors > 3:\n",
    "        break\n",
    "    messages = [message(idx, caption)]\n",
    "    try:\n",
    "        client.chat.completions.create(model=MODEL, messages=messages, max_tokens=MAX_TOKENS)\n",
    "    except Exception as exc:\n",
    "        # Best-effort: report and move on. `Exception` (not `BaseException`) lets\n",
    "        # KeyboardInterrupt / SystemExit still stop the loop.\n",
    "        consecutive_errors += 1\n",
    "        print(f\"Contest {idx} failed ({type(exc).__name__}): {exc}\")\n",
    "    else:\n",
    "        consecutive_errors = 0\n",
    "        remaining -= 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "vision-fixture-charts",
   "metadata": {},
   "source": [
    "## ChartMimic figures (images passed inline as base64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9789799a7e5ff94",
   "metadata": {},
   "outputs": [],
   "source": [
    "charts_raw = pd.read_parquet(\"hf://datasets/ChartMimic/ChartMimic/test.parquet\")\n",
    "# Only the \"hard\" rows, longest instruction first.\n",
    "charts_df = charts_raw.loc[charts_raw.Difficulty == \"hard\"].sort_values(\n",
    "    by=\"Instruction\", key=lambda c: c.apply(len), ascending=False\n",
    ")\n",
    "charts_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "156d457ec9b97253",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): iloc[1:25] skips the first (longest-instruction) row and sends 24\n",
    "# requests -- confirm that starting at 1 is intentional.\n",
    "for _, instruction, input_figure in (\n",
    "    charts_df.loc[:, [\"Instruction\", \"InputFigurePreview\"]].iloc[1:25].itertuples()\n",
    "):\n",
    "    image_bytes = input_figure[\"bytes\"]\n",
    "    encoded_image = b64encode(image_bytes).decode()\n",
    "    # Named `chart_message` so the `message()` helper defined above is not overwritten.\n",
    "    chart_message = {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"text\", \"text\": instruction},\n",
    "            {\n",
    "                \"type\": \"image_url\",\n",
    "                \"image_url\": {\n",
    "                    \"url\": f\"data:image/png;base64,{encoded_image}\",\n",
    "                    \"detail\": \"low\",\n",
    "                },\n",
    "            },\n",
    "        ],\n",
    "    }\n",
    "    client.chat.completions.create(model=MODEL, messages=[chart_message], max_tokens=MAX_TOKENS)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "vision-fixture-export",
   "metadata": {},
   "source": [
    "## Export the collected traces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee9cd888d968c194",
   "metadata": {},
   "outputs": [],
   "source": [
    "td = px.Client().get_trace_dataset(timeout=1000, project_name=PROJECT_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbf96e075f00375b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give every span a random start time from the last three days and shift its end\n",
    "# time by the same amount, so each span keeps its original duration.\n",
    "fake = Faker()\n",
    "start_time = pd.Series(\n",
    "    [fake.date_time_between(\"-3d\") for _ in range(len(td.dataframe))], index=td.dataframe.index\n",
    ")\n",
    "duration = td.dataframe.end_time - td.dataframe.start_time\n",
    "end_time = start_time + duration\n",
    "td.dataframe[\"start_time\"] = start_time\n",
    "td.dataframe[\"end_time\"] = end_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad88f0c25af44a4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "td.save()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}