From 091e93392bef64f51234fcdf2902463a0585bdac Mon Sep 17 00:00:00 2001 From: Valentino Date: Mon, 30 Jan 2023 03:26:11 -0800 Subject: [PATCH] added custom functions for each dataset to conver QA pais to Instruct-Reponse conversations (#922) --- .../unified-qa/unified-qa.ipynb | 935 +++++++++++++++--- 1 file changed, 813 insertions(+), 122 deletions(-) diff --git a/notebooks/data-augmentation/unified-qa/unified-qa.ipynb b/notebooks/data-augmentation/unified-qa/unified-qa.ipynb index 37c85b3d..f19b8746 100644 --- a/notebooks/data-augmentation/unified-qa/unified-qa.ipynb +++ b/notebooks/data-augmentation/unified-qa/unified-qa.ipynb @@ -9,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b2e3c95c", "metadata": {}, @@ -55,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 1, "id": "95b57b2c", "metadata": {}, "outputs": [], @@ -174,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 2, "id": "cba85ada", "metadata": {}, "outputs": [], @@ -213,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 3, "id": "9cf91317", "metadata": {}, "outputs": [ @@ -269,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 4, "id": "d35ab066", "metadata": {}, "outputs": [], @@ -312,72 +311,10 @@ }, { "cell_type": "markdown", - "id": "e7457bae", + "id": "7d60c673", "metadata": {}, "source": [ - "# Download and convert" - ] - }, - { - "cell_type": "markdown", - "id": "54b0fd63", - "metadata": {}, - "source": [ - "We firstly import pandas, which we'll use to download the TSV files from Google Cloud Storage, and any other libraries that we'll need." 
- ] - }, - { - "cell_type": "code", - "execution_count": 86, - "id": "9317d4b4", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import json" - ] - }, - { - "cell_type": "markdown", - "id": "62dc4e18", - "metadata": {}, - "source": [ - "The following is a simple function to take the data (which has two columns) and convert it to a tree with a root note (question) and one child (answer)." - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "id": "963e0d92", - "metadata": {}, - "outputs": [], - "source": [ - "def convert_unified_qa(dataset_url):\n", - " # download using pandas\n", - " ds = pd.read_csv(dataset_url, on_bad_lines=\"skip\", names=[\"Question\", \"Answer\"], sep=\"\\t\")\n", - " # get name for metatdata\n", - " ds_name = dataset_url.split(\"/unifiedqa/data/\")[1].split(\"/\")[0]\n", - "\n", - " # create conversation forest\n", - " conversation_forest = []\n", - " for item in ds.itertuples():\n", - " # build nodes and tree\n", - " root = ConversationTreeNode(text=item.Question, role=\"prompter\", children=[], metadata=None)\n", - " child = ConversationTreeNode(text=item.Answer, role=\"assistant\", children=[], metadata=None)\n", - " root.children.append(child)\n", - " conversation_tree = ConversationTree(root=root, metadata={\"dataset\": ds_name})\n", - "\n", - " conversation_forest.append(conversation_tree)\n", - "\n", - " conversation_forest_json = [\n", - " json.loads(TreeEncoder().encode(conversation_tree)) for conversation_tree in conversation_forest\n", - " ]\n", - "\n", - " print(json.dumps(conversation_forest_json, indent=4), file=open(f\"./{ds_name}.json\", \"w+\"))\n", - "\n", - " print(\"*****\", ds_name, \"****\")\n", - " print(ds.head(2))\n", - " print(\"....\")" + "# Manually Get URLs" ] }, { @@ -390,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 23, "id": "43e188b6", "metadata": {}, "outputs": [], @@ -410,7 +347,741 @@ }, { "cell_type": "code", - 
"execution_count": 77, + "execution_count": 24, + "id": "12603bcb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['natural_questions',\n", + " 'narrativeqa',\n", + " 'newsqa',\n", + " 'drop',\n", + " 'commonsenseqa',\n", + " 'physical_iqa',\n", + " 'social_iqa',\n", + " 'boolq',\n", + " 'boolq_np']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset_names = [url[len(\"https://storage.googleapis.com/unifiedqa/data/\") :].split(\"/\")[0] for url in urls]\n", + "dataset_names" + ] + }, + { + "cell_type": "markdown", + "id": "afedc8d0", + "metadata": {}, + "source": [ + "## Convert each dataset to a Prompt-Response pair" + ] + }, + { + "cell_type": "markdown", + "id": "c5600504", + "metadata": {}, + "source": [ + "We'll now create a dictionary of lists: for each dataset index (i) we will have a list that will hold templates (j)" + ] + }, + { + "cell_type": "code", + "execution_count": 236, + "id": "5c57dbcf", + "metadata": {}, + "outputs": [], + "source": [ + "converter_functions = {}" + ] + }, + { + "cell_type": "markdown", + "id": "2c3098d2", + "metadata": {}, + "source": [ + "## 1. Natural Questions" + ] + }, + { + "cell_type": "markdown", + "id": "c17a51c4", + "metadata": {}, + "source": [ + "Dataset has short answers but it the questions are framed as natural questions, as the data set name would imply." + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "id": "e35ccd5b", + "metadata": {}, + "outputs": [], + "source": [ + "converter_functions[\"natural_questions\"] = [lambda a, b: [a, b]]" + ] + }, + { + "cell_type": "markdown", + "id": "7522bc8a", + "metadata": {}, + "source": [ + "## 2. 
# 2. Narrative QA


def _nar_qa_parts(q):
    """Return ``(question, context)``: the first two pieces of ``q`` when it is
    split on the literal two-character separator backslash-n.

    Raises IndexError when ``q`` does not contain the separator.
    """
    pieces = q.split("\\n")
    return pieces[0], pieces[1]


def nar_qa_1(q, a):
    """Two turns: the raw input string, then the answer."""
    return [q, a]


def nar_qa_2(q, a):
    """Four turns: the text is supplied in the opening message, then the question is asked."""
    question, context = _nar_qa_parts(q)
    return [
        "I am going to be asking you some questions on the following text:" + context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def nar_qa_3(q, a):
    """Six turns: announce the task, hand over the text on request, then ask."""
    question, context = _nar_qa_parts(q)
    return [
        "I am going to be asking you some questions about the following text",
        "Sure, I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def nar_qa_4(q, a):
    """Six turns, opened with a request for help with a text."""
    question, context = _nar_qa_parts(q)
    return [
        "I have a text that I need help with",
        "I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def nar_qa_5(q, a):
    """Six turns, opened by asking whether the assistant can help."""
    question, context = _nar_qa_parts(q)
    return [
        "Can you help me answer questions about a text?",
        "Yes, as I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def nar_qa_6(q, a):
    """Four turns: the question comes first and the assistant asks for the text."""
    question, context = _nar_qa_parts(q)
    return [
        "Based on the text that I will give you, please answer the following question: " + question,
        "Okay sure, as I can help you with answering the question '"
        + question
        + "'. What text should I use to answer this question?",
        context,
        a,
    ]


templates_nar_qa = [nar_qa_1, nar_qa_2, nar_qa_3, nar_qa_4, nar_qa_5, nar_qa_6]
converter_functions["narrativeqa"] = templates_nar_qa
# 3. News QA


def _news_qa_parts(q):
    """Split a NewsQA input into ``(question, article)``.

    ``q`` must consist of exactly two pieces joined by the literal
    two-character separator backslash-n; otherwise the tuple unpacking raises
    ValueError (the converter loop treats any exception as "skip this row").

    A leading dateline such as "(CNN) -- " is removed from the article.
    Everything after the FIRST "-- " is kept, so an article that contains a
    later "-- " is no longer cut off there. An article without "-- " is
    returned unchanged.
    """
    question, context = q.split("\\n")
    _, dateline_sep, article = context.partition("-- ")
    if dateline_sep:
        context = article
    return question, context


def news_qa_1(q, a):
    """Two turns: the raw input string, then the answer."""
    return [q, a]


def news_qa_2(q, a):
    """Four turns: the article is supplied in the opening message, then the question is asked."""
    question, context = _news_qa_parts(q)
    return [
        "I am going to be asking you some questions on the following text:" + context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def news_qa_3(q, a):
    """Six turns: announce the task, hand over the article on request, then ask."""
    question, context = _news_qa_parts(q)
    return [
        "I am going to be asking you some questions about the following text",
        "Sure, I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def news_qa_4(q, a):
    """Six turns, opened with a request for help with a text."""
    question, context = _news_qa_parts(q)
    return [
        "I have a text that I need help with",
        "I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def news_qa_5(q, a):
    """Six turns, opened by asking whether the assistant can help."""
    question, context = _news_qa_parts(q)
    return [
        "Can you help me answer questions about a text?",
        "Yes, as I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def news_qa_6(q, a):
    """Four turns: the question comes first and the assistant asks for the article."""
    question, context = _news_qa_parts(q)
    return [
        "Based on the text that I will give you, please answer the following question: " + question,
        "Okay sure, as I can help you with answering the question '"
        + question
        + "'. What text should I use to answer this question?",
        context,
        a,
    ]


templates_news_qa = [news_qa_1, news_qa_2, news_qa_3, news_qa_4, news_qa_5, news_qa_6]
converter_functions["newsqa"] = templates_news_qa
# 4. Drop


def _drop_qa_parts(q):
    """Return ``(question, context)``: the first two pieces of ``q`` when it is
    split on the literal two-character separator backslash-n.

    Raises IndexError when ``q`` does not contain the separator.
    """
    pieces = q.split("\\n")
    return pieces[0], pieces[1]


def drop_qa_1(q, a):
    """Two turns: the raw input string, then the answer."""
    return [q, a]


def drop_qa_2(q, a):
    """Four turns: the text is supplied in the opening message, then the question is asked."""
    question, context = _drop_qa_parts(q)
    return [
        "I am going to be asking you some questions on the following text:" + context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def drop_qa_3(q, a):
    """Six turns: announce the task, hand over the text on request, then ask."""
    question, context = _drop_qa_parts(q)
    return [
        "I am going to be asking you some questions about the following text",
        "Sure, I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def drop_qa_4(q, a):
    """Six turns, opened with a request for help with a text."""
    question, context = _drop_qa_parts(q)
    return [
        "I have a text that I need help with",
        "I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def drop_qa_5(q, a):
    """Six turns, opened by asking whether the assistant can help."""
    question, context = _drop_qa_parts(q)
    return [
        "Can you help me answer questions about a text?",
        "Yes, as I can help you with understanding and analyzing a text. What is the text that you would like me to work on?",
        context,
        "Okay, what question do you have about the text?",
        question,
        a,
    ]


def drop_qa_6(q, a):
    """Four turns: the question comes first and the assistant asks for the text."""
    question, context = _drop_qa_parts(q)
    return [
        "Based on the text that I will give you, please answer the following question: " + question,
        "Okay sure, as I can help you with answering the question '"
        + question
        + "'. What text should I use to answer this question?",
        context,
        a,
    ]


templates_drop_qa = [drop_qa_1, drop_qa_2, drop_qa_3, drop_qa_4, drop_qa_5, drop_qa_6]
converter_functions["drop"] = templates_drop_qa


# Templates shared by the three multiple-choice datasets below. The originals
# were identical copies, so they now delegate to one implementation each.


def _multiple_choice_conv(q, a):
    """Four turns: the user announces a multiple-choice question, then poses it."""
    return [
        "I have a multiple choice question that I need help with",
        "Okay, I can help you with multiple choice questions. Please provide the question.",
        q,
        "The answer is: " + a,
    ]


def _may_i_ask_conv(q, a):
    """Four turns: the user asks permission to ask, then poses the question."""
    return [
        "Can I ask you a question?",
        "Sure, you can ask me a question! I'll try my best to answer it.",
        q,
        "I think the answer is: " + a,
    ]


# 5. CommonsenseQA


def cs_qa_1(q, a):
    """Two turns: the raw input string, then the answer."""
    return [q, a]


def cs_qa_2(q, a):
    """Four-turn multiple-choice framing."""
    return _multiple_choice_conv(q, a)


def cs_qa_3(q, a):
    """Four turns framed as a common-sense quiz."""
    return [
        "I have some common sense questions for you to answer.",
        "Okay, I can try to answer your questions while using common sense. Please provide the question.",
        q,
        # Typo fix: this text is emitted into the dataset ("commmon" -> "common").
        "The common sense answer would be: " + a,
    ]


templates_cs_qa = [cs_qa_1, cs_qa_2, cs_qa_3]
converter_functions["commonsenseqa"] = templates_cs_qa


# 6. Physical IQA


def ph_qa_1(q, a):
    """Two turns: the raw input string, then the answer."""
    return [q, a]


def ph_qa_2(q, a):
    """Four-turn multiple-choice framing."""
    return _multiple_choice_conv(q, a)


def ph_qa_3(q, a):
    """Four-turn "can I ask you a question?" framing."""
    return _may_i_ask_conv(q, a)


def ph_qa_4(q, a):
    """Two turns using only the text before the first backslash-n separator."""
    return [q.split("\\n")[0], a]


templates_ph_qa = [ph_qa_1, ph_qa_2, ph_qa_3, ph_qa_4]
converter_functions["physical_iqa"] = templates_ph_qa


# 7. Social IQA


def _so_qa_parts(q):
    """Return ``(question, options, context)`` from a Social IQA input.

    ``q`` must consist of exactly three pieces joined by the literal
    two-character separator backslash-n; otherwise the tuple unpacking raises
    ValueError.
    """
    ques, options, context = q.split("\\n")
    return ques, options, context


def so_qa_1(q, a):
    """Two turns: the raw input string, then the answer."""
    return [q, a]


def so_qa_2(q, a):
    """Four-turn multiple-choice framing."""
    return _multiple_choice_conv(q, a)


def so_qa_3(q, a):
    """Four-turn "can I ask you a question?" framing."""
    return _may_i_ask_conv(q, a)


def so_qa_4(q, a):
    """Four turns: context first, then the bare question (options omitted)."""
    ques, _options, context = _so_qa_parts(q)
    return [
        "I have a question about this text:" + context,
        "Okay, what question do you have?",
        ques,
        a,
    ]


def so_qa_5(q, a):
    """Four turns: context first, then the question together with its options."""
    ques, options, context = _so_qa_parts(q)
    return [
        "I have a question about this text:" + context,
        "Okay, what question do you have?",
        ques + "\\n" + options,
        a,
    ]


def so_qa_6(q, a):
    """Four turns: question first, then the assistant asks for the context."""
    ques, _options, context = _so_qa_parts(q)
    return [
        "Based on the text that I will provide, please answer the following question:" + ques,
        "Okay, what text can I use to derive the answer?",
        context,
        a,
    ]


templates_so_qa = [so_qa_1, so_qa_2, so_qa_3, so_qa_4, so_qa_5, so_qa_6]
converter_functions["social_iqa"] = templates_so_qa
# 8. BoolQ


def bq_qa_1(q, a):
    """Two turns: the raw input string, then the answer."""
    return [q, a]


def bq_qa_2(q, a):
    """Two turns: the bare question, answered with the capitalized answer
    followed by the supporting passage.

    ``q`` must consist of exactly two pieces joined by the literal
    two-character separator backslash-n; otherwise ValueError is raised.
    """
    ques, context = q.split("\\n")
    return [ques, a.capitalize() + ". " + context]


def bq_qa_3(q, a):
    """Four turns: passage first, then the question, answered with reference to the passage.

    Raises ValueError under the same condition as ``bq_qa_2``.
    """
    ques, context = q.split("\\n")
    return [
        "Based on the following text, please answer my questions: " + context,
        "Sure, what question do you have?",
        ques,
        "Based on the text above, the answer is: " + a,
    ]


templates_bq_qa = [bq_qa_1, bq_qa_2, bq_qa_3]
converter_functions["boolq"] = templates_bq_qa

# 9. BoolQ NP -- reuses the BoolQ templates.
converter_functions["boolq_np"] = templates_bq_qa


# Helper Functions


def is_valid_conversation(my_conv, q, a, verbose=False):
    """Quality gate for a generated conversation.

    Args:
        my_conv: list of turns produced by a template function.
        q: original question, used only for the diagnostic printout.
        a: original answer, used only for the diagnostic printout.
        verbose: when True, print why the conversation was rejected.

    Returns:
        True when ``my_conv`` is non-empty, has an even number of entries
        (every prompt has a reply) and every entry is a ``str``; otherwise
        False.
    """
    if len(my_conv) == 0:
        problem = "Empty conversation for"
    elif len(my_conv) % 2 != 0:
        problem = "Uneven number of entries in"
    elif not all(isinstance(item, str) for item in my_conv):
        problem = "Non-str entries in"
    else:
        return True

    if verbose:
        print(problem)
        # q may itself be the non-str value that got the conversation
        # rejected (e.g. a float NaN), so convert before slicing.
        print(str(q)[:1000])
        print(a)
    return False
"outputs": [], + "source": [ + "def print_conv(root):\n", + " if root.text != None:\n", + " print(root.text[:100])\n", + " if len(root.children) > 0:\n", + " print_conv(root.children[0])\n", + " return \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "e7457bae", + "metadata": {}, + "source": [ + "# Download and Save as Raw Inputs" + ] + }, + { + "cell_type": "markdown", + "id": "54b0fd63", + "metadata": {}, + "source": [ + "We firstly import pandas, which we'll use to download the TSV files from Google Cloud Storage, and any other libraries that we'll need." + ] + }, + { + "cell_type": "code", + "execution_count": 254, + "id": "9317d4b4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import json\n", + "import random\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "id": "0fea67d2", + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(20) # for reproduciablity" + ] + }, + { + "cell_type": "markdown", + "id": "62dc4e18", + "metadata": {}, + "source": [ + "The following is a simple function to take the data (which has two columns) and convert it to a tree with a root note (question) and one child (answer)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "963e0d92", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_unified_qa(dataset_url):\n", + " # download using pandas\n", + " ds = pd.read_csv(dataset_url, on_bad_lines=\"skip\", names=[\"Question\", \"Answer\"], sep=\"\\t\")\n", + " # get name for metatdata\n", + " ds_name = dataset_url.split(\"/unifiedqa/data/\")[1].split(\"/\")[0]\n", + " # get conversation templates list\n", + " conv_funcs = converter_functions[ds_name]\n", + "\n", + " # create conversation forest\n", + " conversation_forest = []\n", + " for item in ds.itertuples():\n", + " # get q,a from table\n", + " question = item.Question\n", + " answer = item.Answer\n", + " if question == np.nan or answer == np.nan:\n", + " print(\"Skipped\")\n", + " # get a random conversation generatore function\n", + " conv_func = random.choice(conv_funcs)\n", + " try:\n", + " conv_list = conv_func(question, answer)\n", + " except:\n", + " print(\"!!!!!!!!!!!! Skipped one example\")\n", + " # print(conv_func)\n", + " # print(question)\n", + " # print(answer)\n", + " continue\n", + " if not is_valid_conversation(conv_list, item.Question, item.Answer):\n", + " print(\"!!!!!!!!!!!! 
Skipped one example\")\n", + " continue\n", + " # build nodes and tree\n", + " root = ConversationTreeNode(text=conv_list[0], role=\"prompter\", children=[], metadata=None)\n", + " prev_node = root\n", + " for i in range(1, len(conv_list)):\n", + " role = \"prompter\"\n", + " if i % 2 == 1:\n", + " role = \"assistant\"\n", + " next_node = ConversationTreeNode(text=conv_list[i], role=\"assistant\", children=[], metadata=None)\n", + " prev_node.children.append(next_node)\n", + " prev_node = next_node\n", + " conversation_tree = ConversationTree(root=root, metadata={\"dataset\": ds_name})\n", + "\n", + " # save the tree to the forest\n", + " conversation_forest.append(conversation_tree)\n", + "\n", + " conversation_forest_json = [\n", + " json.loads(TreeEncoder().encode(conversation_tree)) for conversation_tree in conversation_forest\n", + " ]\n", + "\n", + " print(json.dumps(conversation_forest_json, indent=4), file=open(f\"./{ds_name}.json\", \"w+\"))\n", + "\n", + " print(\"Finished converting dataset\")\n", + " print(\" \")\n", + " print(\"*****\", ds_name, \"****\")\n", + " # print(ds.head(2))\n", + " print(print_conv(conversation_forest[0].root))" + ] + }, + { + "cell_type": "code", + "execution_count": 261, "id": "b39bb154", "metadata": {}, "outputs": [ @@ -418,63 +1089,75 @@ "name": "stdout", "output_type": "stream", "text": [ + "!!!!!!!!!!!! Skipped one example\n", + "Finished converting dataset\n", + " \n", "***** natural_questions ****\n", - " Question \\\n", - "0 which is the most common use of opt-in e-mail ... \n", - "1 how i.met your mother who is the mother? \n", + "which is the most common use of opt-in e-mail marketing?\n", + "a newsletter sent to an advertising firm's customers\n", "\n", - " Answer \n", - "0 a newsletter sent to an advertising firm's cus... \n", - "1 Tracy McConnell \n", - "....\n", + "Finished converting dataset\n", + " \n", "***** narrativeqa ****\n", - " Question \\\n", - "0 Who is Miss Delmer? \\n At Madeline Hall, an o... 
\n", - "1 Who is Miss Delmer? \\n At Madeline Hall, an o... \n", + "I am going to be asking you some questions about the following text\n", + "Sure, I can help you with understanding and analyzing a text. What is the text that you would like m\n", + " At Madeline Hall, an old mansion-house near Southampton belonging to the wealthy de Versely family\n", + "Okay, what question do you have about the text?\n", + "Who is Miss Delmer? \n", + " the elderly spinster aunt of the Earl de Verseley and Captain Delmar \n", "\n", - " Answer \n", - "0 the elderly spinster aunt of the Earl de Vers... \n", - "1 She's Captail Delmar's aunt. \n", - "....\n", + "!!!!!!!!!!!! Skipped one example\n", + "Finished converting dataset\n", + " \n", "***** newsqa ****\n", - " Question Answer\n", - "0 How many Americans are part of the federal foo... 31 million\n", - "1 How much did Sean Callebs live on? \\n (CNN) --... $176\n", - "....\n", - "***** drop ****\n", - " Question Answer\n", - "0 How many points did the buccaneers need to tie... 3\n", - "1 How many field goals did the Lions score? \\n T... 2\n", - "....\n", - "***** commonsenseqa ****\n", - " Question Answer\n", - "0 The sanctions against the school were a punish... ignore\n", - "1 Sammy wanted to go to where the people were. ... populated areas\n", - "....\n", - "***** physical_iqa ****\n", - " Question \\\n", - "0 When boiling butter, when it's ready, you can ... \n", - "1 To permanently attach metal legs to a chair, y... \n", + "How many Americans are part of the federal food assistance program? \\n (CNN) -- As Walter Thomas kno\n", + "31 million\n", "\n", - " Answer \n", - "0 Pour it into a jar \n", - "1 Weld the metal together to get it to stay fir... 
\n", - "....\n", + "Finished converting dataset\n", + " \n", + "***** drop ****\n", + "I am going to be asking you some questions on the following text: To start the season, the Lions tra\n", + "Okay, what question do you have about the text?\n", + "How many points did the buccaneers need to tie in the first? \n", + "3\n", + "\n", + "Finished converting dataset\n", + " \n", + "***** commonsenseqa ****\n", + "The sanctions against the school were a punishing blow, and they seemed to what the efforts the scho\n", + "ignore\n", + "\n", + "!!!!!!!!!!!! Skipped one example\n", + "!!!!!!!!!!!! Skipped one example\n", + "!!!!!!!!!!!! Skipped one example\n", + "Finished converting dataset\n", + " \n", + "***** physical_iqa ****\n", + "When boiling butter, when it's ready, you can \n", + " Pour it into a jar\n", + "\n", + "Finished converting dataset\n", + " \n", "***** social_iqa ****\n", - " Question Answer\n", - "0 How would Others feel as a result? \\n (A) like... like attending\n", - "1 What will Others want to do next? \\n (A) disag... get to work\n", - "....\n", + "I have a multiple choice question that I need help with\n", + "Okay, I can help you with multiple choice questions. Please provide the question.\n", + "How would Others feel as a result? \\n (A) like attending (B) like staying home (C) a good friend to \n", + "The answer is: like attending\n", + "\n", + "Finished converting dataset\n", + " \n", "***** boolq ****\n", - " Question Answer\n", - "0 can you buy beer at a grocery store in pa? \\n ... yes\n", - "1 is the baby in fuller house a twin? \\n (List o... yes\n", - "....\n", + "Based on the following text, please answer my questions: (Alcohol laws of Pennsylvania) Some superm\n", + "Sure, what question do you have?\n", + "can you buy beer at a grocery store in pa? 
\n", + "Based on the text above, the answer is: yes\n", + "\n", + "Finished converting dataset\n", + " \n", "***** boolq_np ****\n", - " Question Answer\n", - "0 do iran and afghanistan write the same languag... yes\n", - "1 do iran and afghanistan read the same language... yes\n", - "....\n" + "do iran and afghanistan write the same language?\\n(Persian language) Persian (/ˈpɜːrʒən, -ʃən/), als\n", + "yes\n", + "\n" ] } ], @@ -486,7 +1169,15 @@ { "cell_type": "code", "execution_count": null, - "id": "f0309674", + "id": "ffa48c56", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbabc707", "metadata": {}, "outputs": [], "source": [] @@ -494,7 +1185,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -508,7 +1199,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4 (tags/v3.7.4:e09359112e, Jul 8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]" + "version": "3.10.9" }, "vscode": { "interpreter": {