From 091e93392bef64f51234fcdf2902463a0585bdac Mon Sep 17 00:00:00 2001
From: Valentino <vale95ntino@users.noreply.github.com>
Date: Mon, 30 Jan 2023 03:26:11 -0800
Subject: [PATCH] added custom functions for each dataset to conver QA pais to
 Instruct-Reponse conversations (#922)

---
 .../unified-qa/unified-qa.ipynb               | 935 +++++++++++++++---
 1 file changed, 813 insertions(+), 122 deletions(-)

diff --git a/notebooks/data-augmentation/unified-qa/unified-qa.ipynb b/notebooks/data-augmentation/unified-qa/unified-qa.ipynb
index 37c85b3d..f19b8746 100644
--- a/notebooks/data-augmentation/unified-qa/unified-qa.ipynb
+++ b/notebooks/data-augmentation/unified-qa/unified-qa.ipynb
@@ -9,7 +9,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "b2e3c95c",
    "metadata": {},
@@ -55,7 +54,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
+   "execution_count": 1,
    "id": "95b57b2c",
    "metadata": {},
    "outputs": [],
@@ -174,7 +173,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 2,
    "id": "cba85ada",
    "metadata": {},
    "outputs": [],
@@ -213,7 +212,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": 3,
    "id": "9cf91317",
    "metadata": {},
    "outputs": [
@@ -269,7 +268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": 4,
    "id": "d35ab066",
    "metadata": {},
    "outputs": [],
@@ -312,72 +311,10 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e7457bae",
+   "id": "7d60c673",
    "metadata": {},
    "source": [
-    "# Download and convert"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "54b0fd63",
-   "metadata": {},
-   "source": [
-    "We firstly import pandas, which we'll use to download the TSV files from Google Cloud Storage, and any other libraries that we'll need."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 86,
-   "id": "9317d4b4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import json"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "62dc4e18",
-   "metadata": {},
-   "source": [
-    "The following is a simple function to take the data (which has two columns) and convert it to a tree with a root note (question) and one child (answer)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 87,
-   "id": "963e0d92",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def convert_unified_qa(dataset_url):\n",
-    "    # download using pandas\n",
-    "    ds = pd.read_csv(dataset_url, on_bad_lines=\"skip\", names=[\"Question\", \"Answer\"], sep=\"\\t\")\n",
-    "    # get name for metatdata\n",
-    "    ds_name = dataset_url.split(\"/unifiedqa/data/\")[1].split(\"/\")[0]\n",
-    "\n",
-    "    # create conversation forest\n",
-    "    conversation_forest = []\n",
-    "    for item in ds.itertuples():\n",
-    "        # build nodes and tree\n",
-    "        root = ConversationTreeNode(text=item.Question, role=\"prompter\", children=[], metadata=None)\n",
-    "        child = ConversationTreeNode(text=item.Answer, role=\"assistant\", children=[], metadata=None)\n",
-    "        root.children.append(child)\n",
-    "        conversation_tree = ConversationTree(root=root, metadata={\"dataset\": ds_name})\n",
-    "\n",
-    "        conversation_forest.append(conversation_tree)\n",
-    "\n",
-    "    conversation_forest_json = [\n",
-    "        json.loads(TreeEncoder().encode(conversation_tree)) for conversation_tree in conversation_forest\n",
-    "    ]\n",
-    "\n",
-    "    print(json.dumps(conversation_forest_json, indent=4), file=open(f\"./{ds_name}.json\", \"w+\"))\n",
-    "\n",
-    "    print(\"*****\", ds_name, \"****\")\n",
-    "    print(ds.head(2))\n",
-    "    print(\"....\")"
+    "# Manually Get URLs"
    ]
   },
   {
@@ -390,7 +327,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": 23,
    "id": "43e188b6",
    "metadata": {},
    "outputs": [],
@@ -410,7 +347,741 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 24,
+   "id": "12603bcb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['natural_questions',\n",
+       " 'narrativeqa',\n",
+       " 'newsqa',\n",
+       " 'drop',\n",
+       " 'commonsenseqa',\n",
+       " 'physical_iqa',\n",
+       " 'social_iqa',\n",
+       " 'boolq',\n",
+       " 'boolq_np']"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset_names = [url[len(\"https://storage.googleapis.com/unifiedqa/data/\") :].split(\"/\")[0] for url in urls]\n",
+    "dataset_names"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "afedc8d0",
+   "metadata": {},
+   "source": [
+    "## Convert each dataset to a Prompt-Response pair"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5600504",
+   "metadata": {},
+   "source": [
+    "We'll now create a dictionary of lists: for each dataset name we will have a list that holds its conversation template functions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 236,
+   "id": "5c57dbcf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "converter_functions = {}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c3098d2",
+   "metadata": {},
+   "source": [
+    "## 1. Natural Questions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c17a51c4",
+   "metadata": {},
+   "source": [
+    "The dataset has short answers, but the questions are framed as natural questions, as the dataset name implies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 237,
+   "id": "e35ccd5b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "converter_functions[\"natural_questions\"] = [lambda a, b: [a, b]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7522bc8a",
+   "metadata": {},
+   "source": [
+    "## 2. Narrative QA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 238,
+   "id": "fa30ceb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nar_qa_1(q, a):\n",
+    "    return [q, a]\n",
+    "\n",
+    "\n",
+    "def nar_qa_2(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I am going to be asking you some questions on the following text:\" + q.split(\"\\\\n\")[1])\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(q.split(\"\\\\n\")[0])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def nar_qa_3(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I am going to be asking you some questions about the following text\")\n",
+    "    conv.append(\n",
+    "        \"Sure, I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(q.split(\"\\\\n\")[1])\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(q.split(\"\\\\n\")[0])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def nar_qa_4(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I have a text that I need help with\")\n",
+    "    conv.append(\n",
+    "        \"I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(q.split(\"\\\\n\")[1])\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(q.split(\"\\\\n\")[0])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def nar_qa_5(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"Can you help me answer questions about a text?\")\n",
+    "    conv.append(\n",
+    "        \"Yes, as I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(q.split(\"\\\\n\")[1])\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(q.split(\"\\\\n\")[0])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def nar_qa_6(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"Based on the text that I will give you, please answer the following question: \" + q.split(\"\\\\n\")[0])\n",
+    "    conv.append(\n",
+    "        \"Okay sure, as I can help you with answering the question '\"\n",
+    "        + q.split(\"\\\\n\")[0]\n",
+    "        + \"'. What text should I use to answer this question?\"\n",
+    "    )\n",
+    "    conv.append(q.split(\"\\\\n\")[1])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "templates_nar_qa = [nar_qa_1, nar_qa_2, nar_qa_3, nar_qa_4, nar_qa_5, nar_qa_6]\n",
+    "converter_functions[\"narrativeqa\"] = templates_nar_qa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7107046b",
+   "metadata": {},
+   "source": [
+    "## 3. News QA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 239,
+   "id": "bd5d25f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _split_news_question(q):\n",
+    "    \"\"\"Split a NewsQA row into (question, article text).\n",
+    "\n",
+    "    The row must contain exactly one literal backslash-n separator, otherwise the\n",
+    "    unpacking raises ValueError. Everything up to the first \"-- \" (a dateline such\n",
+    "    as \"(CNN) -- \") is dropped; later dashes in the article are left untouched.\n",
+    "    \"\"\"\n",
+    "    question, context = q.split(\"\\\\n\")\n",
+    "    _, dateline_sep, article = context.partition(\"-- \")\n",
+    "    if dateline_sep:\n",
+    "        context = article\n",
+    "    return question, context\n",
+    "\n",
+    "\n",
+    "def news_qa_1(q, a):\n",
+    "    \"\"\"Use the raw question and answer as a single exchange.\"\"\"\n",
+    "    return [q, a]\n",
+    "\n",
+    "\n",
+    "def news_qa_2(q, a):\n",
+    "    \"\"\"Give the article in the opening message, then ask the question.\"\"\"\n",
+    "    question, context = _split_news_question(q)\n",
+    "    conv = []\n",
+    "    conv.append(\"I am going to be asking you some questions on the following text:\" + context)\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(question)\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def news_qa_3(q, a):\n",
+    "    \"\"\"Announce the task, supply the article when asked, then ask the question.\"\"\"\n",
+    "    question, context = _split_news_question(q)\n",
+    "    conv = []\n",
+    "    conv.append(\"I am going to be asking you some questions about the following text\")\n",
+    "    conv.append(\n",
+    "        \"Sure, I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(context)\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(question)\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def news_qa_4(q, a):\n",
+    "    \"\"\"Open with a request for help, supply the article, then ask the question.\"\"\"\n",
+    "    question, context = _split_news_question(q)\n",
+    "    conv = []\n",
+    "    conv.append(\"I have a text that I need help with\")\n",
+    "    conv.append(\n",
+    "        \"I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(context)\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(question)\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def news_qa_5(q, a):\n",
+    "    \"\"\"Open by asking whether help is available, supply the article, then ask.\"\"\"\n",
+    "    question, context = _split_news_question(q)\n",
+    "    conv = []\n",
+    "    conv.append(\"Can you help me answer questions about a text?\")\n",
+    "    conv.append(\n",
+    "        \"Yes, as I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(context)\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(question)\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def news_qa_6(q, a):\n",
+    "    \"\"\"Ask the question first; the article is supplied only when requested.\"\"\"\n",
+    "    question, context = _split_news_question(q)\n",
+    "    conv = []\n",
+    "    conv.append(\"Based on the text that I will give you, please answer the following question: \" + question)\n",
+    "    conv.append(\n",
+    "        \"Okay sure, as I can help you with answering the question '\"\n",
+    "        + question\n",
+    "        + \"'. What text should I use to answer this question?\"\n",
+    "    )\n",
+    "    conv.append(context)\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "templates_news_qa = [news_qa_1, news_qa_2, news_qa_3, news_qa_4, news_qa_5, news_qa_6]\n",
+    "converter_functions[\"newsqa\"] = templates_news_qa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dd5ac9ae",
+   "metadata": {},
+   "source": [
+    "## 4. Drop"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 240,
+   "id": "b73fa5ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def drop_qa_1(q, a):\n",
+    "    return [q, a]\n",
+    "\n",
+    "\n",
+    "def drop_qa_2(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I am going to be asking you some questions on the following text:\" + q.split(\"\\\\n\")[1])\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(q.split(\"\\\\n\")[0])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def drop_qa_3(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I am going to be asking you some questions about the following text\")\n",
+    "    conv.append(\n",
+    "        \"Sure, I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(q.split(\"\\\\n\")[1])\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(q.split(\"\\\\n\")[0])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def drop_qa_4(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I have a text that I need help with\")\n",
+    "    conv.append(\n",
+    "        \"I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(q.split(\"\\\\n\")[1])\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(q.split(\"\\\\n\")[0])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def drop_qa_5(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"Can you help me answer questions about a text?\")\n",
+    "    conv.append(\n",
+    "        \"Yes, as I can help you with understanding and analyzing a text. What is the text that you would like me to work on?\"\n",
+    "    )\n",
+    "    conv.append(q.split(\"\\\\n\")[1])\n",
+    "    conv.append(\"Okay, what question do you have about the text?\")\n",
+    "    conv.append(q.split(\"\\\\n\")[0])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def drop_qa_6(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"Based on the text that I will give you, please answer the following question: \" + q.split(\"\\\\n\")[0])\n",
+    "    conv.append(\n",
+    "        \"Okay sure, as I can help you with answering the question '\"\n",
+    "        + q.split(\"\\\\n\")[0]\n",
+    "        + \"'. What text should I use to answer this question?\"\n",
+    "    )\n",
+    "    conv.append(q.split(\"\\\\n\")[1])\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "templates_drop_qa = [drop_qa_1, drop_qa_2, drop_qa_3, drop_qa_4, drop_qa_5, drop_qa_6]\n",
+    "converter_functions[\"drop\"] = templates_drop_qa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "90ada37d",
+   "metadata": {},
+   "source": [
+    "## 5. CommonsenseQA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 241,
+   "id": "ea5b4d68",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cs_qa_1(q, a):\n",
+    "    return [q, a]\n",
+    "\n",
+    "\n",
+    "def cs_qa_2(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I have a multiple choice question that I need help with\")\n",
+    "    conv.append(\"Okay, I can help you with multiple choice questions. Please provide the question.\")\n",
+    "    conv.append(q)\n",
+    "    conv.append(\"The answer is: \" + a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def cs_qa_3(q, a):\n",
+    "    \"\"\"Frame the pair as a common sense exchange: announce, accept, ask, answer.\"\"\"\n",
+    "    conv = [\"I have some common sense questions for you to answer.\"]\n",
+    "    conv.append(\"Okay, I can try to answer your questions while using common sense. Please provide the question.\")\n",
+    "    conv.append(q)\n",
+    "    conv.append(\"The common sense answer would be: \" + a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "templates_cs_qa = [cs_qa_1, cs_qa_2, cs_qa_3]\n",
+    "converter_functions[\"commonsenseqa\"] = templates_cs_qa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "661ca13b",
+   "metadata": {},
+   "source": [
+    "## 6. Physical IQA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 242,
+   "id": "2ed37170",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ph_qa_1(q, a):\n",
+    "    return [q, a]\n",
+    "\n",
+    "\n",
+    "def ph_qa_2(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I have a multiple choice question that I need help with\")\n",
+    "    conv.append(\"Okay, I can help you with multiple choice questions. Please provide the question.\")\n",
+    "    conv.append(q)\n",
+    "    conv.append(\"The answer is: \" + a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def ph_qa_3(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"Can I ask you a question?\")\n",
+    "    conv.append(\"Sure, you can ask me a question! I'll try my best to answer it.\")\n",
+    "    conv.append(q)\n",
+    "    conv.append(\"I think the answer is: \" + a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def ph_qa_4(q, a):\n",
+    "    return [q.split(\"\\\\n\")[0], a]\n",
+    "\n",
+    "\n",
+    "templates_ph_qa = [ph_qa_1, ph_qa_2, ph_qa_3, ph_qa_4]\n",
+    "converter_functions[\"physical_iqa\"] = templates_ph_qa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb9df9c1",
+   "metadata": {},
+   "source": [
+    "## 7. Social IQA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 243,
+   "id": "0574a531",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def so_qa_1(q, a):\n",
+    "    return [q, a]\n",
+    "\n",
+    "\n",
+    "def so_qa_2(q, a):\n",
+    "    conv = []\n",
+    "    conv.append(\"I have a multiple choice question that I need help with\")\n",
+    "    conv.append(\"Okay, I can help you with multiple choice questions. Please provide the question.\")\n",
+    "    conv.append(q)\n",
+    "    conv.append(\"The answer is: \" + a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def so_qa_3(q, a):\n",
+    "    \"\"\"Ask permission first, then pose the question and answer it tentatively.\"\"\"\n",
+    "    conv = []\n",
+    "    conv.append(\"Can I ask you a question?\")\n",
+    "    conv.append(\"Sure, you can ask me a question! I'll try my best to answer it.\")\n",
+    "    conv.append(q)\n",
+    "    conv.append(\"I think the answer is: \" + a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def so_qa_4(q, a):\n",
+    "    conv = []\n",
+    "    ques, options, context = q.split(\"\\\\n\")\n",
+    "    conv.append(\"I have a question about this text:\" + context)\n",
+    "    conv.append(\"Okay, what question do you have?\")\n",
+    "    conv.append(ques)\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def so_qa_5(q, a):\n",
+    "    conv = []\n",
+    "    ques, options, context = q.split(\"\\\\n\")\n",
+    "    conv.append(\"I have a question about this text:\" + context)\n",
+    "    conv.append(\"Okay, what question do you have?\")\n",
+    "    conv.append(ques + \"\\\\n\" + options)\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def so_qa_6(q, a):\n",
+    "    conv = []\n",
+    "    ques, options, context = q.split(\"\\\\n\")\n",
+    "    conv.append(\"Based on the text that I will provide, please answer the following question:\" + ques)\n",
+    "    conv.append(\"Okay, what text can I use to derive the answer?\")\n",
+    "    conv.append(context)\n",
+    "    conv.append(a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "templates_so_qa = [so_qa_1, so_qa_2, so_qa_3, so_qa_4, so_qa_5, so_qa_6]\n",
+    "converter_functions[\"social_iqa\"] = templates_so_qa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53158de8",
+   "metadata": {},
+   "source": [
+    "## 8. BoolQ"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 244,
+   "id": "cd9b20d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def bq_qa_1(q, a):\n",
+    "    return [q, a]\n",
+    "\n",
+    "\n",
+    "def bq_qa_2(q, a):\n",
+    "    ques, context = q.split(\"\\\\n\")\n",
+    "    conv = []\n",
+    "    conv.append(ques)\n",
+    "    conv.append(a.capitalize() + \". \" + context)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "def bq_qa_3(q, a):\n",
+    "    ques, context = q.split(\"\\\\n\")\n",
+    "    conv = []\n",
+    "    conv.append(\"Based on the following text, please answer my questions: \" + context)\n",
+    "    conv.append(\"Sure, what question do you have?\")\n",
+    "    conv.append(ques)\n",
+    "    conv.append(\"Based on the text above, the answer is: \" + a)\n",
+    "    return conv\n",
+    "\n",
+    "\n",
+    "templates_bq_qa = [bq_qa_1, bq_qa_2, bq_qa_3]\n",
+    "converter_functions[\"boolq\"] = templates_bq_qa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6a1d741f",
+   "metadata": {},
+   "source": [
+    "## 9. BoolQ NP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 245,
+   "id": "f71f75b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "converter_functions[\"boolq_np\"] = templates_bq_qa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "760bb9b4",
+   "metadata": {},
+   "source": [
+    "## Helper Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 252,
+   "id": "335a8e05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Quality assurance function\n",
+    "def is_valid_conversation(my_conv, q, a, verbose=False):\n",
+    "    if not len(my_conv) % 2 == 0:\n",
+    "        if verbose:\n",
+    "            print(\"Uneven number of entries in\")\n",
+    "            print(q[:1000])\n",
+    "            print(a)\n",
+    "        return False\n",
+    "    if not all(isinstance(item, str) for item in my_conv):\n",
+    "        if verbose:\n",
+    "            print(\"Non-str entries in\")\n",
+    "            print(q[:1000])\n",
+    "            print(a)\n",
+    "        return False\n",
+    "    return True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 253,
+   "id": "001b32a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_conv(root):\n",
+    "    if root.text != None:\n",
+    "        print(root.text[:100])\n",
+    "    if len(root.children) > 0:\n",
+    "        print_conv(root.children[0])\n",
+    "    return \"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e7457bae",
+   "metadata": {},
+   "source": [
+    "# Download and Save as Raw Inputs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "54b0fd63",
+   "metadata": {},
+   "source": [
+    "We first import pandas, which we'll use to download the TSV files from Google Cloud Storage, along with the other libraries that we'll need."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 254,
+   "id": "9317d4b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import json\n",
+    "import random\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 259,
+   "id": "0fea67d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "random.seed(20)  # for reproducibility"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "62dc4e18",
+   "metadata": {},
+   "source": [
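+    "The following function takes the data (which has two columns), passes each row through a randomly chosen conversation template, and converts the result to a tree: a root node (the first prompter message) followed by a chain of single-child nodes, one per subsequent turn."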
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "963e0d92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_unified_qa(dataset_url):\n",
+    "    \"\"\"Download one UnifiedQA TSV file and save it as ./<dataset name>.json.\n",
+    "\n",
+    "    Every row goes through a randomly chosen template; rows that are missing a\n",
+    "    value, break the template or fail validation are skipped.\n",
+    "    \"\"\"\n",
+    "    # download using pandas\n",
+    "    ds = pd.read_csv(dataset_url, on_bad_lines=\"skip\", names=[\"Question\", \"Answer\"], sep=\"\\t\")\n",
+    "    # get name for metadata\n",
+    "    ds_name = dataset_url.split(\"/unifiedqa/data/\")[1].split(\"/\")[0]\n",
+    "    # get conversation templates list\n",
+    "    conv_funcs = converter_functions[ds_name]\n",
+    "\n",
+    "    # create conversation forest\n",
+    "    conversation_forest = []\n",
+    "    for item in ds.itertuples():\n",
+    "        question, answer = item.Question, item.Answer\n",
+    "        # NaN never compares equal to itself, so missing cells need pd.isna\n",
+    "        if pd.isna(question) or pd.isna(answer):\n",
+    "            print(\"Skipped\")\n",
+    "            continue\n",
+    "        # get a random conversation generator function\n",
+    "        conv_func = random.choice(conv_funcs)\n",
+    "        try:\n",
+    "            conv_list = conv_func(question, answer)\n",
+    "        except Exception:\n",
+    "            # the row does not fit the layout this template expects\n",
+    "            print(\"!!!!!!!!!!!! Skipped one example\")\n",
+    "            continue\n",
+    "        if not is_valid_conversation(conv_list, question, answer):\n",
+    "            print(\"!!!!!!!!!!!! Skipped one example\")\n",
+    "            continue\n",
+    "        # build nodes and tree\n",
+    "        root = ConversationTreeNode(text=conv_list[0], role=\"prompter\", children=[], metadata=None)\n",
+    "        prev_node = root\n",
+    "        for i in range(1, len(conv_list)):\n",
+    "            # turns alternate: odd positions are assistant replies, even ones prompter messages\n",
+    "            role = \"assistant\" if i % 2 == 1 else \"prompter\"\n",
+    "            next_node = ConversationTreeNode(text=conv_list[i], role=role, children=[], metadata=None)\n",
+    "            prev_node.children.append(next_node)\n",
+    "            prev_node = next_node\n",
+    "        conversation_tree = ConversationTree(root=root, metadata={\"dataset\": ds_name})\n",
+    "        conversation_forest.append(conversation_tree)\n",
+    "\n",
+    "    conversation_forest_json = [\n",
+    "        json.loads(TreeEncoder().encode(conversation_tree)) for conversation_tree in conversation_forest\n",
+    "    ]\n",
+    "\n",
+    "    with open(f\"./{ds_name}.json\", \"w+\") as out_file:\n",
+    "        print(json.dumps(conversation_forest_json, indent=4), file=out_file)\n",
+    "\n",
+    "    print(\"Finished converting dataset\")\n",
+    "    print(\" \")\n",
+    "    print(\"*****\", ds_name, \"****\")\n",
+    "    print(print_conv(conversation_forest[0].root))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 261,
    "id": "b39bb154",
    "metadata": {},
    "outputs": [
@@ -418,63 +1089,75 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "!!!!!!!!!!!! Skipped one example\n",
+      "Finished converting dataset\n",
+      " \n",
       "***** natural_questions ****\n",
-      "                                            Question  \\\n",
-      "0  which is the most common use of opt-in e-mail ...   \n",
-      "1           how i.met your mother who is the mother?   \n",
+      "which is the most common use of opt-in e-mail marketing?\n",
+      "a newsletter sent to an advertising firm's customers\n",
       "\n",
-      "                                              Answer  \n",
-      "0  a newsletter sent to an advertising firm's cus...  \n",
-      "1                                    Tracy McConnell  \n",
-      "....\n",
+      "Finished converting dataset\n",
+      " \n",
       "***** narrativeqa ****\n",
-      "                                            Question  \\\n",
-      "0  Who is Miss Delmer? \\n  At Madeline Hall, an o...   \n",
-      "1  Who is Miss Delmer? \\n  At Madeline Hall, an o...   \n",
+      "I am going to be asking you some questions about the following text\n",
+      "Sure, I can help you with understanding and analyzing a text. What is the text that you would like m\n",
+      "  At Madeline Hall, an old mansion-house near Southampton belonging to the wealthy de Versely family\n",
+      "Okay, what question do you have about the text?\n",
+      "Who is Miss Delmer? \n",
+      " the elderly spinster aunt of the Earl de Verseley and Captain Delmar \n",
       "\n",
-      "                                              Answer  \n",
-      "0   the elderly spinster aunt of the Earl de Vers...  \n",
-      "1                      She's Captail Delmar's aunt.   \n",
-      "....\n",
+      "!!!!!!!!!!!! Skipped one example\n",
+      "Finished converting dataset\n",
+      " \n",
       "***** newsqa ****\n",
-      "                                            Question      Answer\n",
-      "0  How many Americans are part of the federal foo...  31 million\n",
-      "1  How much did Sean Callebs live on? \\n (CNN) --...        $176\n",
-      "....\n",
-      "***** drop ****\n",
-      "                                            Question Answer\n",
-      "0  How many points did the buccaneers need to tie...      3\n",
-      "1  How many field goals did the Lions score? \\n T...      2\n",
-      "....\n",
-      "***** commonsenseqa ****\n",
-      "                                            Question           Answer\n",
-      "0  The sanctions against the school were a punish...           ignore\n",
-      "1  Sammy wanted to go to where the people were.  ...  populated areas\n",
-      "....\n",
-      "***** physical_iqa ****\n",
-      "                                            Question  \\\n",
-      "0  When boiling butter, when it's ready, you can ...   \n",
-      "1  To permanently attach metal legs to a chair, y...   \n",
+      "How many Americans are part of the federal food assistance program? \\n (CNN) -- As Walter Thomas kno\n",
+      "31 million\n",
       "\n",
-      "                                              Answer  \n",
-      "0                                 Pour it into a jar  \n",
-      "1   Weld the metal together to get it to stay fir...  \n",
-      "....\n",
+      "Finished converting dataset\n",
+      " \n",
+      "***** drop ****\n",
+      "I am going to be asking you some questions on the following text: To start the season, the Lions tra\n",
+      "Okay, what question do you have about the text?\n",
+      "How many points did the buccaneers need to tie in the first? \n",
+      "3\n",
+      "\n",
+      "Finished converting dataset\n",
+      " \n",
+      "***** commonsenseqa ****\n",
+      "The sanctions against the school were a punishing blow, and they seemed to what the efforts the scho\n",
+      "ignore\n",
+      "\n",
+      "!!!!!!!!!!!! Skipped one example\n",
+      "!!!!!!!!!!!! Skipped one example\n",
+      "!!!!!!!!!!!! Skipped one example\n",
+      "Finished converting dataset\n",
+      " \n",
+      "***** physical_iqa ****\n",
+      "When boiling butter, when it's ready, you can \n",
+      " Pour it into a jar\n",
+      "\n",
+      "Finished converting dataset\n",
+      " \n",
       "***** social_iqa ****\n",
-      "                                            Question           Answer\n",
-      "0  How would Others feel as a result? \\n (A) like...   like attending\n",
-      "1  What will Others want to do next? \\n (A) disag...      get to work\n",
-      "....\n",
+      "I have a multiple choice question that I need help with\n",
+      "Okay, I can help you with multiple choice questions. Please provide the question.\n",
+      "How would Others feel as a result? \\n (A) like attending (B) like staying home (C) a good friend to \n",
+      "The answer is:  like attending\n",
+      "\n",
+      "Finished converting dataset\n",
+      " \n",
       "***** boolq ****\n",
-      "                                            Question Answer\n",
-      "0  can you buy beer at a grocery store in pa? \\n ...    yes\n",
-      "1  is the baby in fuller house a twin? \\n (List o...    yes\n",
-      "....\n",
+      "Based on the following text, please answer my questions:  (Alcohol laws of Pennsylvania) Some superm\n",
+      "Sure, what question do you have?\n",
+      "can you buy beer at a grocery store in pa? \n",
+      "Based on the text above, the answer is: yes\n",
+      "\n",
+      "Finished converting dataset\n",
+      " \n",
       "***** boolq_np ****\n",
-      "                                            Question Answer\n",
-      "0  do iran and afghanistan write the same languag...    yes\n",
-      "1  do iran and afghanistan read the same language...    yes\n",
-      "....\n"
+      "do iran and afghanistan write the same language?\\n(Persian language) Persian (/ˈpɜːrʒən, -ʃən/), als\n",
+      "yes\n",
+      "\n"
      ]
     }
    ],
@@ -486,7 +1169,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f0309674",
+   "id": "ffa48c56",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cbabc707",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -494,7 +1185,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -508,7 +1199,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]"
+   "version": "3.10.9"
   },
   "vscode": {
    "interpreter": {