{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "N3shQZoZPScM", "outputId": "63642e05-bd32-4fd9-f029-8f50148a1e8a" }, "outputs": [], "source": [ "%pip install -U -q sentence_transformers" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rcBH0FzwVOk6", "outputId": "f5b4b762-9b30-4474-d1d0-7ba3ab68a2ef" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "%pip install -q datasets" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y-pDMu97XyVd", "outputId": "737160a3-2c34-4293-a129-bb053cd91117" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting sentence-transformers\n", " Using cached sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)\n", "Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (1.4.1.post1)\n", "Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (2.2.1)\n", "Collecting torch\n", " Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)\n", "Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)\n", " Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence-transformers) (4.67.1)\n", "Requirement already satisfied: scipy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence-transformers) (1.12.0)\n", "Requirement already satisfied: huggingface-hub>=0.20.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence-transformers) (0.28.1)\n", "Requirement already satisfied: Pillow in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence-transformers) (10.2.0)\n", "Requirement already satisfied: numpy<2.0,>=1.19.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (1.26.4)\n", "Requirement already satisfied: joblib>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (1.3.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (3.3.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/markushenriksson/Library/Python/3.12/lib/python/site-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas) (2024.1)\n", "Requirement already satisfied: tzdata>=2022.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas) (2024.1)\n", "Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from torch) (3.16.0)\n", "Requirement already satisfied: 
typing-extensions>=4.10.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from torch) (4.10.0)\n", "Collecting networkx (from torch)\n", " Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)\n", "Requirement already satisfied: jinja2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from torch) (3.1.3)\n", "Requirement already satisfied: fsspec in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from torch) (2024.2.0)\n", "Requirement already satisfied: setuptools in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from torch) (69.1.1)\n", "Collecting sympy==1.13.1 (from torch)\n", " Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)\n", "Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)\n", " Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)\n", "Requirement already satisfied: packaging>=20.9 in /Users/markushenriksson/Library/Python/3.12/lib/python/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (24.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1)\n", "Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)\n", "Requirement already satisfied: six>=1.5 in /Users/markushenriksson/Library/Python/3.12/lib/python/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", "Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)\n", " Using cached regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)\n", "Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)\n", " Downloading 
tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 kB)\n", "Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)\n", " Downloading safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jinja2->torch) (2.1.5)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.6)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2024.2.2)\n", "Using cached sentence_transformers-3.4.1-py3-none-any.whl (275 kB)\n", "Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl (66.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.5/66.5 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hUsing cached sympy-1.13.1-py3-none-any.whl (6.2 MB)\n", "Downloading transformers-4.48.3-py3-none-any.whl (9.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.7/9.7 MB\u001b[0m \u001b[31m35.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hUsing cached networkx-3.4.2-py3-none-any.whl (1.7 MB)\n", "Using 
cached mpmath-1.3.0-py3-none-any.whl (536 kB)\n", "Using cached regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl (284 kB)\n", "Downloading safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl (408 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m408.9/408.9 kB\u001b[0m \u001b[31m27.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl (2.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m36.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hInstalling collected packages: mpmath, sympy, safetensors, regex, networkx, torch, tokenizers, transformers, sentence-transformers\n", "Successfully installed mpmath-1.3.0 networkx-3.4.2 regex-2024.11.6 safetensors-0.5.2 sentence-transformers-3.4.1 sympy-1.13.1 tokenizers-0.21.0 torch-2.6.0 transformers-4.48.3\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install sentence-transformers scikit-learn pandas torch\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "m-tmgXuldd3C" }, "outputs": [], "source": [ "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "1Z0mgYZEgjC4" }, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "aBmXLbZ4cc1U" }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n" 
] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "LXkXdIWgUcWI" }, "outputs": [], "source": [ "from datasets import load_dataset, Dataset\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "AFkI23ySgtkV" }, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ehvh1BJZWa_1", "outputId": "212a5f82-885d-4e61-a73f-94dcf12a3a39" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ab0344420d964f64a16c911f17aae057", "version_major": 2, "version_minor": 0 }, "text/plain": [ "README.md: 0%| | 0.00/515 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9f86cbde053f4b9e91cff137a924082f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "train-00000-of-00001.parquet: 0%| | 0.00/5.89M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2f6d00487331444299405cc97d4b18ea", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0%| | 0/61199 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " answer system_prompt \\\n", "0 neutral You are a financial sentiment analysis expert.... \n", "1 neutral You are a financial sentiment analysis expert.... \n", "2 negative You are a financial sentiment analysis expert.... \n", "3 positive You are a financial sentiment analysis expert.... \n", "4 positive You are a financial sentiment analysis expert.... \n", "\n", " user_prompt task_type \n", "0 According to Gran , the company has no plans t... sentiment_analysis \n", "1 Technopolis plans to develop in stages an area... 
sentiment_analysis \n", "2 The international electronic industry company ... sentiment_analysis \n", "3 With the new production plant the company woul... sentiment_analysis \n", "4 According to the company 's updated strategy f... sentiment_analysis \n" ] } ], "source": [ "ds = load_dataset(\"NickyNicky/Finance_sentiment_and_topic_classification_En\")\n", "\n", "# Convert the 'train' split of the DatasetDict to a pandas DataFrame\n", "df = pd.DataFrame(ds['train'])\n", "\n", "\n", "print(df.head())\n", "\n", "\n", "df.to_csv(\"train_data.csv\", index=False)\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "wS-PmD5WWnYC", "outputId": "36732946-2bb0-4f58-f784-5619d77698b9" }, "outputs": [ { "data": { "text/html": [ "
\n", " | answer | \n", "system_prompt | \n", "user_prompt | \n", "task_type | \n", "
---|---|---|---|---|
0 | \n", "neutral | \n", "You are a financial sentiment analysis expert.... | \n", "According to Gran , the company has no plans t... | \n", "sentiment_analysis | \n", "
1 | \n", "neutral | \n", "You are a financial sentiment analysis expert.... | \n", "Technopolis plans to develop in stages an area... | \n", "sentiment_analysis | \n", "
2 | \n", "negative | \n", "You are a financial sentiment analysis expert.... | \n", "The international electronic industry company ... | \n", "sentiment_analysis | \n", "
3 | \n", "positive | \n", "You are a financial sentiment analysis expert.... | \n", "With the new production plant the company woul... | \n", "sentiment_analysis | \n", "
4 | \n", "positive | \n", "You are a financial sentiment analysis expert.... | \n", "According to the company 's updated strategy f... | \n", "sentiment_analysis | \n", "
\n", " | answer | \n", "user_prompt | \n", "
---|---|---|
0 | \n", "neutral | \n", "According to Gran , the company has no plans t... | \n", "
1 | \n", "neutral | \n", "Technopolis plans to develop in stages an area... | \n", "
2 | \n", "negative | \n", "The international electronic industry company ... | \n", "
3 | \n", "positive | \n", "With the new production plant the company woul... | \n", "
4 | \n", "positive | \n", "According to the company 's updated strategy f... | \n", "
... | \n", "... | \n", "... | \n", "
61194 | \n", "Treasuries | Corporate Debt | \n", "KfW credit line for Uniper could be raised to ... | \n", "
61195 | \n", "Treasuries | Corporate Debt | \n", "KfW credit line for Uniper could be raised to ... | \n", "
61196 | \n", "Treasuries | Corporate Debt | \n", "Russian https://t.co/R0iPhyo5p7 sells 1 bln r... | \n", "
61197 | \n", "Treasuries | Corporate Debt | \n", "Global ESG bond issuance posts H1 dip as supra... | \n", "
61198 | \n", "Treasuries | Corporate Debt | \n", "Brazil's Petrobras says it signed a $1.25 bill... | \n", "
61199 rows × 2 columns
\n", "RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=42)
RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=200,\n", " random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=200,\n", " random_state=42)