diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 0000000..4f5c571 --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,3 @@ +# Notebooks + +Data science for MyHerodotus is performed using Jupyter notebooks running on Vertex AI Workbench. However, the file size of Jupyter notebooks makes it prohibitive to download them. Thus, all notebooks are stored in the `data- \ No newline at end of file diff --git a/notebooks/fine-tuning.ipynb b/notebooks/fine-tuning.ipynb deleted file mode 100644 index 2a2e13b..0000000 --- a/notebooks/fine-tuning.ipynb +++ /dev/null @@ -1,915 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e2b8a82f-9020-477d-abed-2e6f0e081c0f", - "metadata": {}, - "source": [ - "# Fine tune a Gemini and a Gemma model\n", - "\n", - "+ Dataset: [Guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco)\n", - "+ Prepare the data: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-prepare\n", - "+ Gemma: https://huggingface.co/google/gemma-2-27b-it-pytorch\n", - " - Also https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemma2\n", - " - Also https://www.kaggle.com/models/google/gemma-2\n", - "+ Gemini: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-use-supervised-tuning" - ] - }, - { - "cell_type": "markdown", - "id": "d5e90114-d5b6-4382-a0ed-f50da90d022d", - "metadata": {}, - "source": [ - "## Step 0. Install and import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1bb3a321-1db0-404c-87eb-99038430bff8", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing requirements.txt\n" - ] - } - ], - "source": [ - "%%writefile requirements.txt\n", - "datasets\n", - "pandas\n", - "torch\n", - "google-cloud-aiplatform\n", - "google-cloud-storage\n", - "jsonschema" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f542d78-860a-44e7-a200-8bd353a1233b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install --upgrade -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d916ee2-00d7-4719-afe3-d5e97215c34e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install datasets" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "bb50502e-9218-450a-ad0c-d08159df55c7", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: jsonschema in /opt/conda/lib/python3.10/site-packages (4.23.0)\n", - "Requirement already satisfied: attrs>=22.2.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema) (24.2.0)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.10/site-packages (from jsonschema) (2023.12.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.10/site-packages (from jsonschema) (0.35.1)\n", - "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from jsonschema) (0.20.0)\n" - ] - } - ], - "source": [ - "!pip install jsonschema" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "bdba50cd-c52b-462d-8e88-1c2c923599bf", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import json\n", - "import pandas as pd\n", - "import time\n", - "import torch\n", - "from datasets import load_dataset\n", - "from jsonschema import validate\n", - "from jsonschema.protocols import Validator\n", - "\n", - "import vertexai\n", - "from vertexai.generative_models import GenerativeModel\n", - "from vertexai.tuning import sft\n", - "\n", - "from google.cloud import storage\n", - "from google.cloud import aiplatform" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "ef947d19-fd2c-424e-8742-fa5fcace3a38", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "README.md: 100%|███████████████████████████| 7.62k/7.62k [00:00<00:00, 19.5MB/s]\n", - "train-00000-of-00001.parquet: 100%|█████████| 14.5M/14.5M [00:00<00:00, 149MB/s]\n", - "validation-00000-of-00001.parquet: 100%|████| 1.82M/1.82M [00:00<00:00, 295MB/s]\n", - "Generating train split: 100%|██| 87599/87599 [00:00<00:00, 103689.07 examples/s]\n", - "Generating validation split: 100%|█| 10570/10570 [00:00<00:00, 302876.11 example\n", - "{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}\n" - ] - } - ], - "source": [ - "!python -c \"from datasets import load_dataset; print(load_dataset('squad', split='train')[0])\"" - ] - }, - { - "cell_type": "markdown", - "id": "9e5760f1-f238-4a50-a3f0-c0117d8bc7e3", - "metadata": {}, - "source": [ - "## Step 1. Transform dataset for Vertex\n", - "\n", - "GUANACO dataset shape:\n", - "\n", - "```json\n", - "{\n", - " \"text\": \"### Human: blah blah .### Assistant: blah blah.### Human: blah blah blah\"\n", - "}\n", - "\n", - "```\n", - "\n", - "Vertex tuning dataset shape. Note that the `systemInstruction` field is optional -- and not necessary for this\n", - "exercise. [From here](https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-prepare).\n", - "\n", - "```json\n", - "{\n", - " \"systemInstruction\": {\n", - " \"role\": string,\n", - " \"parts\": [\n", - " {\n", - " \"text\": string\n", - " }\n", - " ]\n", - " },\n", - " \"contents\": [\n", - " {\n", - " \"role\": string, // must be \"user\" or \"model\"\n", - " \"parts\": [\n", - " {\n", - " // Union field data can be only one of the following:\n", - " \"text\": string,\n", - " \"fileData\": {\n", - " \"mimeType\": string,\n", - " \"fileUri\": string\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " ]\n", - "}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "cd5b2560-18d2-4691-a7ef-747e16ebabdd", - "metadata": {}, - "source": [ - "Here is a pseudocode transform for the dataset:\n", - "\n", - "1. Load a row from the Guanaco dataset.\n", - "1. Read the `text` field.\n", - "1. Split the `text` field on the `###` character string.\n", - "1. Read the first substring of each item in the list, reading up to the `:` character.\n", - "1. If the first substring is \"Human\", create a new dictionary like so:\n", - "\n", - " ```json\n", - " {\n", - " \"role\": \"user\",\n", - " \"parts\": {\n", - " \"text\": \"[REMAINDER OF SPLIT\"\n", - " }\n", - " }\n", - " ```\n", - "1. If the first substring is \"Assistant\", create a new dictionary like so:\n", - "\n", - " ```json\n", - " {\n", - " \"role\": \"model\",\n", - " \"parts\": {\n", - " \"text\": \"[REMAINDER OF SPLIT\"\n", - " }\n", - " }\n", - " ```\n", - "1. Append each dictionary to a list.\n", - "1. Create one last new dictionary and set the `contents` field like so:\n", - "\n", - " ```json\n", - " {\n", - " \"contents\": [TEXT DICTIONARIES]\n", - " }\n", - " ```" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "9c820ca1-bd8f-4872-8839-fb879822c341", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Create the schema validation\n", - "# NOTE: This is only a partial schema for this data transform purpose.\n", - "schema = {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"contents\": {\n", - " \"type\": \"array\",\n", - " \"items\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"role\": { \"type\": \"string\" },\n", - " \"parts\": {\n", - " \"type\": \"array\",\n", - " \"items\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"text\": { \"type\": \"string\" }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - "}\n", - "Validator.check_schema(schema)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ba8036a-977b-4698-8009-49a642484dab", - "metadata": {}, - "outputs": [], - "source": [ - "# Pandas\n", - "splits = {'train': 'openassistant_best_replies_train.jsonl', 'test': 'openassistant_best_replies_eval.jsonl'}\n", - "df = pd.read_json(\"hf://datasets/timdettmers/openassistant-guanaco/\" + splits[\"train\", lines=True])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "946307ee-1148-4b4c-936e-c29e544d1773", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "69fed47f25734ac984687c3b428ca44d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "README.md: 0%| | 0.00/395 [00:00 google.cloud.aiplatform_v1beta1.types.llm_utility_service.ComputeTokensResponse\n", - " | Computes tokens.\n", - " | \n", - " | Args:\n", - " | contents: Contents to send to the model.\n", - " | Supports either a list of Content objects (passing a multi-turn conversation)\n", - " | or a value that can be converted to a single Content object (passing a single message).\n", - " | Supports\n", - " | * str, Image, Part,\n", - " | * List[Union[str, Image, Part]],\n", - " | * List[Content]\n", - " | \n", - " | Returns:\n", - " | A ComputeTokensResponse object that has the following attributes:\n", - " | tokens_info: Lists of tokens_info from the input.\n", - " | The input `contents: ContentsType` could have\n", - " | multiple string instances and each tokens_info\n", - " | item represents each string instance. Each token\n", - " | info consists tokens list, token_ids list and\n", - " | a role.\n", - " | \n", - " | async compute_tokens_async(self, contents: Union[List[ForwardRef('Content')], List[Dict[str, Any]], str, ForwardRef('Image'), ForwardRef('Part'), List[Union[str, ForwardRef('Image'), ForwardRef('Part')]]]) -> google.cloud.aiplatform_v1beta1.types.llm_utility_service.ComputeTokensResponse\n", - " | Computes tokens asynchronously.\n", - " | \n", - " | Args:\n", - " | contents: Contents to send to the model.\n", - " | Supports either a list of Content objects (passing a multi-turn conversation)\n", - " | or a value that can be converted to a single Content object (passing a single message).\n", - " | Supports\n", - " | * str, Image, Part,\n", - " | * List[Union[str, Image, Part]],\n", - " | * List[Content]\n", - " | \n", - " | Returns:\n", - " | And awaitable for a ComputeTokensResponse object that has the following attributes:\n", - " | tokens_info: Lists of tokens_info from the input.\n", - " | The input `contents: ContentsType` could have\n", - " | multiple string instances and each tokens_info\n", - " | item represents each string instance. Each token\n", - " | info consists tokens list, token_ids list and\n", - " | a role.\n", - " | \n", - " | count_tokens(self, contents: Union[List[ForwardRef('Content')], List[Dict[str, Any]], str, ForwardRef('Image'), ForwardRef('Part'), List[Union[str, ForwardRef('Image'), ForwardRef('Part')]]], *, tools: Optional[List[ForwardRef('Tool')]] = None) -> google.cloud.aiplatform_v1beta1.types.prediction_service.CountTokensResponse\n", - " | Counts tokens.\n", - " | \n", - " | Args:\n", - " | contents: Contents to send to the model.\n", - " | Supports either a list of Content objects (passing a multi-turn conversation)\n", - " | or a value that can be converted to a single Content object (passing a single message).\n", - " | Supports\n", - " | * str, Image, Part,\n", - " | * List[Union[str, Image, Part]],\n", - " | * List[Content]\n", - " | tools: A list of tools (functions) that the model can try calling.\n", - " | \n", - " | Returns:\n", - " | A CountTokensResponse object that has the following attributes:\n", - " | total_tokens: The total number of tokens counted across all instances from the request.\n", - " | total_billable_characters: The total number of billable characters counted across all instances from the request.\n", - " | \n", - " | async count_tokens_async(self, contents: Union[List[ForwardRef('Content')], List[Dict[str, Any]], str, ForwardRef('Image'), ForwardRef('Part'), List[Union[str, ForwardRef('Image'), ForwardRef('Part')]]], *, tools: Optional[List[ForwardRef('Tool')]] = None) -> google.cloud.aiplatform_v1beta1.types.prediction_service.CountTokensResponse\n", - " | Counts tokens asynchronously.\n", - " | \n", - " | Args:\n", - " | contents: Contents to send to the model.\n", - " | Supports either a list of Content objects (passing a multi-turn conversation)\n", - " | or a value that can be converted to a single Content object (passing a single message).\n", - " | Supports\n", - " | * str, Image, Part,\n", - " | * List[Union[str, Image, Part]],\n", - " | * List[Content]\n", - " | tools: A list of tools (functions) that the model can try calling.\n", - " | \n", - " | Returns:\n", - " | And awaitable for a CountTokensResponse object that has the following attributes:\n", - " | total_tokens: The total number of tokens counted across all instances from the request.\n", - " | total_billable_characters: The total number of billable characters counted across all instances from the request.\n", - " | \n", - " | generate_content(self, contents: Union[List[ForwardRef('Content')], List[Dict[str, Any]], str, ForwardRef('Image'), ForwardRef('Part'), List[Union[str, ForwardRef('Image'), ForwardRef('Part')]]], *, generation_config: Union[ForwardRef('GenerationConfig'), Dict[str, Any], NoneType] = None, safety_settings: Union[List[ForwardRef('SafetySetting')], Dict[google.cloud.aiplatform_v1beta1.types.content.HarmCategory, google.cloud.aiplatform_v1beta1.types.content.SafetySetting.HarmBlockThreshold], NoneType] = None, tools: Optional[List[ForwardRef('Tool')]] = None, tool_config: Optional[ForwardRef('ToolConfig')] = None, stream: bool = False) -> Union[ForwardRef('GenerationResponse'), Iterable[ForwardRef('GenerationResponse')]]\n", - " | Generates content.\n", - " | \n", - " | Args:\n", - " | contents: Contents to send to the model.\n", - " | Supports either a list of Content objects (passing a multi-turn conversation)\n", - " | or a value that can be converted to a single Content object (passing a single message).\n", - " | Supports\n", - " | * str, Image, Part,\n", - " | * List[Union[str, Image, Part]],\n", - " | * List[Content]\n", - " | generation_config: Parameters for the generation.\n", - " | safety_settings: Safety settings as a mapping from HarmCategory to HarmBlockThreshold.\n", - " | tools: A list of tools (functions) that the model can try calling.\n", - " | tool_config: Config shared for all tools provided in the request.\n", - " | stream: Whether to stream the response.\n", - " | \n", - " | Returns:\n", - " | A single GenerationResponse object if stream == False\n", - " | A stream of GenerationResponse objects if stream == True\n", - " | \n", - " | async generate_content_async(self, contents: Union[List[ForwardRef('Content')], List[Dict[str, Any]], str, ForwardRef('Image'), ForwardRef('Part'), List[Union[str, ForwardRef('Image'), ForwardRef('Part')]]], *, generation_config: Union[ForwardRef('GenerationConfig'), Dict[str, Any], NoneType] = None, safety_settings: Union[List[ForwardRef('SafetySetting')], Dict[google.cloud.aiplatform_v1beta1.types.content.HarmCategory, google.cloud.aiplatform_v1beta1.types.content.SafetySetting.HarmBlockThreshold], NoneType] = None, tools: Optional[List[ForwardRef('Tool')]] = None, tool_config: Optional[ForwardRef('ToolConfig')] = None, stream: bool = False) -> Union[ForwardRef('GenerationResponse'), AsyncIterable[ForwardRef('GenerationResponse')]]\n", - " | Generates content asynchronously.\n", - " | \n", - " | Args:\n", - " | contents: Contents to send to the model.\n", - " | Supports either a list of Content objects (passing a multi-turn conversation)\n", - " | or a value that can be converted to a single Content object (passing a single message).\n", - " | Supports\n", - " | * str, Image, Part,\n", - " | * List[Union[str, Image, Part]],\n", - " | * List[Content]\n", - " | generation_config: Parameters for the generation.\n", - " | safety_settings: Safety settings as a mapping from HarmCategory to HarmBlockThreshold.\n", - " | tools: A list of tools (functions) that the model can try calling.\n", - " | tool_config: Config shared for all tools provided in the request.\n", - " | stream: Whether to stream the response.\n", - " | \n", - " | Returns:\n", - " | An awaitable for a single GenerationResponse object if stream == False\n", - " | An awaitable for a stream of GenerationResponse objects if stream == True\n", - " | \n", - " | start_chat(self, *, history: Optional[List[ForwardRef('Content')]] = None, response_validation: bool = True) -> 'ChatSession'\n", - " | Creates a stateful chat session.\n", - " | \n", - " | Args:\n", - " | history: Previous history to initialize the chat session.\n", - " | response_validation: Whether to validate responses before adding\n", - " | them to chat history. By default, `send_message` will raise\n", - " | error if the request or response is blocked or if the response\n", - " | is incomplete due to going over the max token limit.\n", - " | If set to `False`, the chat session history will always\n", - " | accumulate the request and response messages even if the\n", - " | reponse if blocked or incomplete. This can result in an unusable\n", - " | chat session state.\n", - " | \n", - " | Returns:\n", - " | A ChatSession object.\n", - " | \n", - " | ----------------------------------------------------------------------\n", - " | Data descriptors inherited from vertexai.generative_models._generative_models._GenerativeModel:\n", - " | \n", - " | __dict__\n", - " | dictionary for instance variables (if defined)\n", - " | \n", - " | __weakref__\n", - " | list of weak references to the object (if defined)\n", - "\n" - ] - } - ], - "source": [ - "help(tuned_model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10c60cce-a703-438c-b00a-a21414c0ad64", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "PyTorch 2.0 (Local)", - "language": "python", - "name": "pytorch-2-0" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}