diff --git a/notebooks/code-bugger/openbugger_example.ipynb b/notebooks/code-bugger/openbugger_example.ipynb
index 22e9b0c8..6e2acd27 100644
--- a/notebooks/code-bugger/openbugger_example.ipynb
+++ b/notebooks/code-bugger/openbugger_example.ipynb
@@ -1,5 +1,13 @@
 {
  "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/code-bugger/openbugger_example.ipynb)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/notebooks/data-argumentation/EssayInstructions.ipynb b/notebooks/data-argumentation/EssayInstructions.ipynb
index c4179382..30834d32 100644
--- a/notebooks/data-argumentation/EssayInstructions.ipynb
+++ b/notebooks/data-argumentation/EssayInstructions.ipynb
@@ -1,5 +1,13 @@
 {
  "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/EssayInstructions.ipynb)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/notebooks/data-argumentation/EssayRevision.ipynb b/notebooks/data-argumentation/EssayRevision.ipynb
index cba9bc5b..2397131c 100644
--- a/notebooks/data-argumentation/EssayRevision.ipynb
+++ b/notebooks/data-argumentation/EssayRevision.ipynb
@@ -1,5 +1,13 @@
 {
  "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/EssayRevision.ipynb)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
diff --git a/notebooks/data-argumentation/StackExchangeBuilder.ipynb b/notebooks/data-argumentation/StackExchangeBuilder.ipynb
index 625d757b..b0dd9a8b 100644
--- a/notebooks/data-argumentation/StackExchangeBuilder.ipynb
+++ b/notebooks/data-argumentation/StackExchangeBuilder.ipynb
@@ -1,28 +1,22 @@
 {
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
-  "colab": {
-   "provenance": []
-  },
-  "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3"
-  },
-  "language_info": {
-   "name": "python"
-  }
- },
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/StackExchangeBuilder.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TB7CEfs8F-8u"
+   },
    "source": [
     "# Ingest StackExchange data dumps\n",
     "This notebook takes a StackExchange Data dump \"Posts.xml\" file and ingests it into a Pandas Dataframe. Outputs of the file can be JSON, JSONL, Parquet, or CSV. "
-   ],
-   "metadata": {
-    "id": "TB7CEfs8F-8u"
-   }
+   ]
   },
   {
    "cell_type": "code",
@@ -40,16 +34,34 @@
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "id": "15mAL7GnzBv0"
+   },
    "source": [
     "# Extract StackExchange\n",
     "Pull StackExchange file dumps. Specific column types are enforced to prevent errors on processing later in the notebook"
-   ],
-   "metadata": {
-    "id": "15mAL7GnzBv0"
-   }
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "FtcvUEaHVxcW",
+    "outputId": "5b0cb19d-e3d9-422b-9077-52241bd09e0e"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dict_keys(['3dprinting_meta', '3dprinting', 'Stackoverflow_com_Posts_7z', 'academia_meta', 'academia', 'ai_meta', 'ai', 'android_meta', 'android', 'anime_meta', 'anime', 'apple_meta', 'apple', 'arduino_meta', 'arduino', 'askubuntu_com_7z', 'astronomy_meta', 'astronomy', 'aviation_meta', 'aviation', 'avp_meta', 'avp', 'beer_meta', 'beer', 'bicycles_meta', 'bicycles', 'bioacoustics_meta', 'bioacoustics', 'bioinformatics_meta', 'bioinformatics', 'biology_meta', 'biology', 'bitcoin_meta', 'bitcoin', 'blender_meta', 'blender', 'boardgames_meta', 'boardgames', 'bricks_meta', 'bricks', 'buddhism_meta', 'buddhism', 'cardano_meta', 'cardano', 'chemistry_meta', 'chemistry', 'chess_meta', 'chess', 'chinese_meta', 'chinese', 'christianity_meta', 'christianity', 'civicrm_meta', 'civicrm', 'codegolf_meta', 'codegolf', 'codereview_meta', 'codereview', 'coffee_meta', 'coffee', 'cogsci_meta', 'cogsci', 'computergraphics_meta', 'computergraphics', 'conlang_meta', 'conlang', 'cooking_meta', 'cooking', 'craftcms_meta', 'craftcms', 'crafts_meta', 'crafts', 'crypto_meta', 'crypto', 'cs_meta', 'cs', 'cseducators_meta', 'cseducators', 'cstheory_meta', 'cstheory', 'datascience_meta', 'datascience', 'dba_meta', 'dba', 'devops_meta', 'devops', 'diy_meta', 'diy', 'drones_meta', 'drones', 'drupal_meta', 'drupal', 'dsp_meta', 'dsp', 'earthscience_meta', 'earthscience', 'ebooks_meta', 'ebooks', 'economics_meta', 'economics', 'electronics_meta', 'electronics', 'elementaryos_meta', 'elementaryos', 'ell_meta', 'ell', 'emacs_meta', 'emacs', 'engineering_meta', 'engineering', 'english_meta', 'english', 'eosio_meta', 'eosio', 'es_meta_stackoverflow_com_7z', 'es_stackoverflow_com_7z', 'esperanto_meta', 'esperanto', 'ethereum_meta', 'ethereum', 'expatriates_meta', 'expatriates', 'expressionengine_meta', 'expressionengine', 'fitness_meta', 'fitness', 'freelancing_meta', 'freelancing', 'french_meta', 'french', 'gamedev_meta', 'gamedev', 'gaming_meta', 'gaming', 'gardening_meta', 'gardening', 
'genealogy_meta', 'genealogy', 'german_meta', 'german', 'gis_meta', 'gis', 'graphicdesign_meta', 'graphicdesign', 'ham_meta', 'ham', 'hardwarerecs_meta', 'hardwarerecs', 'health_meta', 'health', 'hermeneutics_meta', 'hermeneutics', 'hinduism_meta', 'hinduism', 'history_meta', 'history', 'homebrew_meta', 'homebrew', 'hsm_meta', 'hsm', 'interpersonal_meta', 'interpersonal', 'iot_meta', 'iot', 'iota_meta', 'iota', 'islam_meta', 'islam', 'italian_meta', 'italian', 'ja_meta_stackoverflow_com_7z', 'ja_stackoverflow_com_7z', 'japanese_meta', 'japanese', 'joomla_meta', 'joomla', 'judaism_meta', 'judaism', 'korean_meta', 'korean', 'languagelearning_meta', 'languagelearning', 'latin_meta', 'latin', 'law_meta', 'law', 'lifehacks_meta', 'lifehacks', 'linguistics_meta', 'linguistics', 'literature_meta', 'literature', 'magento_meta', 'magento', 'martialarts_meta', 'martialarts', 'materials_meta', 'materials', 'math_meta', 'math', 'matheducators_meta', 'matheducators', 'mathematica_meta', 'mathematica', 'mathoverflow_net_7z', 'mechanics_meta', 'mechanics', 'meta_askubuntu_com_7z', 'meta_mathoverflow_net_7z', 'meta_serverfault_com_7z', 'meta', 'meta_stackoverflow_com_7z', 'meta_superuser_com_7z', 'moderators_meta', 'moderators', 'monero_meta', 'monero', 'money_meta', 'money', 'movies_meta', 'movies', 'music_meta', 'music', 'musicfans_meta', 'musicfans', 'mythology_meta', 'mythology', 'networkengineering_meta', 'networkengineering', 'opendata_meta', 'opendata', 'opensource_meta', 'opensource', 'or_meta', 'or', 'outdoors_meta', 'outdoors', 'parenting_meta', 'parenting', 'patents_meta', 'patents', 'pets_meta', 'pets', 'philosophy_meta', 'philosophy', 'photo_meta', 'photo', 'physics_meta', 'physics', 'pm_meta', 'pm', 'poker_meta', 'poker', 'politics_meta', 'politics', 'portuguese_meta', 'portuguese', 'proofassistants_meta', 'proofassistants', 'pt_meta_stackoverflow_com_7z', 'pt_stackoverflow_com_7z', 'puzzling_meta', 'puzzling', 'quant_meta', 'quant', 'quantumcomputing_meta', 
'quantumcomputing', 'raspberrypi_meta', 'raspberrypi', 'retrocomputing_meta', 'retrocomputing', 'reverseengineering_meta', 'reverseengineering', 'robotics_meta', 'robotics', 'rpg_meta', 'rpg', 'ru_meta_stackoverflow_com_7z', 'ru_stackoverflow_com_7z', 'rus_meta', 'rus', 'russian_meta', 'russian', 'salesforce_meta', 'salesforce', 'scicomp_meta', 'scicomp', 'scifi_meta', 'scifi', 'security_meta', 'security', 'serverfault_com_7z', 'sharepoint_meta', 'sharepoint', 'sitecore_meta', 'sitecore', 'skeptics_meta', 'skeptics', 'softwareengineering_meta', 'softwareengineering', 'softwarerecs_meta', 'softwarerecs', 'solana_meta', 'solana', 'sound_meta', 'sound', 'space_meta', 'space', 'spanish_meta', 'spanish', 'sports_meta', 'sports', 'sqa_meta', 'sqa', 'stackapps_com_7z', 'stackoverflow_com_Badges_7z', 'stackoverflow_com_Comments_7z', 'stackoverflow_com_PostHistory_7z', 'stackoverflow_com_PostLinks_7z', 'stackoverflow_com_Tags_7z', 'stackoverflow_com_Users_7z', 'stackoverflow_com_Votes_7z', 'stats_meta', 'stats', 'stellar_meta', 'stellar', 'substrate_meta', 'substrate', 'superuser_com_7z', 'sustainability_meta', 'sustainability', 'tex_meta', 'tex', 'tezos_meta', 'tezos', 'tor_meta', 'tor', 'travel_meta', 'travel', 'tridion_meta', 'tridion', 'ukrainian_meta', 'ukrainian', 'unix_meta', 'unix', 'ux_meta', 'ux', 'vegetarianism_meta', 'vegetarianism', 'vi_meta', 'vi', 'webapps_meta', 'webapps', 'webmasters_meta', 'webmasters', 'windowsphone_meta', 'windowsphone', 'woodworking_meta', 'woodworking', 'wordpress_meta', 'wordpress', 'workplace_meta', 'workplace', 'worldbuilding_meta', 'worldbuilding', 'writers_meta', 'writers'])\n",
+      "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\n"
+     ]
+    }
+   ],
    "source": [
     "base_url = \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml\"\n",
     "\n",
@@ -73,90 +85,11 @@
     "\n",
     "print(urls.keys())\n",
     "print(urls.get(\"ai\"))"
-   ],
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "FtcvUEaHVxcW",
-    "outputId": "5b0cb19d-e3d9-422b-9077-52241bd09e0e"
-   },
-   "execution_count": null,
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "dict_keys(['3dprinting_meta', '3dprinting', 'Stackoverflow_com_Posts_7z', 'academia_meta', 'academia', 'ai_meta', 'ai', 'android_meta', 'android', 'anime_meta', 'anime', 'apple_meta', 'apple', 'arduino_meta', 'arduino', 'askubuntu_com_7z', 'astronomy_meta', 'astronomy', 'aviation_meta', 'aviation', 'avp_meta', 'avp', 'beer_meta', 'beer', 'bicycles_meta', 'bicycles', 'bioacoustics_meta', 'bioacoustics', 'bioinformatics_meta', 'bioinformatics', 'biology_meta', 'biology', 'bitcoin_meta', 'bitcoin', 'blender_meta', 'blender', 'boardgames_meta', 'boardgames', 'bricks_meta', 'bricks', 'buddhism_meta', 'buddhism', 'cardano_meta', 'cardano', 'chemistry_meta', 'chemistry', 'chess_meta', 'chess', 'chinese_meta', 'chinese', 'christianity_meta', 'christianity', 'civicrm_meta', 'civicrm', 'codegolf_meta', 'codegolf', 'codereview_meta', 'codereview', 'coffee_meta', 'coffee', 'cogsci_meta', 'cogsci', 'computergraphics_meta', 'computergraphics', 'conlang_meta', 'conlang', 'cooking_meta', 'cooking', 'craftcms_meta', 'craftcms', 'crafts_meta', 'crafts', 'crypto_meta', 'crypto', 'cs_meta', 'cs', 'cseducators_meta', 'cseducators', 'cstheory_meta', 'cstheory', 'datascience_meta', 'datascience', 'dba_meta', 'dba', 'devops_meta', 'devops', 'diy_meta', 'diy', 'drones_meta', 'drones', 'drupal_meta', 'drupal', 'dsp_meta', 'dsp', 'earthscience_meta', 'earthscience', 'ebooks_meta', 'ebooks', 'economics_meta', 'economics', 'electronics_meta', 'electronics', 'elementaryos_meta', 'elementaryos', 'ell_meta', 'ell', 'emacs_meta', 'emacs', 'engineering_meta', 'engineering', 'english_meta', 'english', 'eosio_meta', 'eosio', 'es_meta_stackoverflow_com_7z', 'es_stackoverflow_com_7z', 'esperanto_meta', 'esperanto', 'ethereum_meta', 'ethereum', 'expatriates_meta', 'expatriates', 'expressionengine_meta', 'expressionengine', 'fitness_meta', 'fitness', 'freelancing_meta', 'freelancing', 'french_meta', 'french', 'gamedev_meta', 'gamedev', 'gaming_meta', 'gaming', 'gardening_meta', 'gardening', 
'genealogy_meta', 'genealogy', 'german_meta', 'german', 'gis_meta', 'gis', 'graphicdesign_meta', 'graphicdesign', 'ham_meta', 'ham', 'hardwarerecs_meta', 'hardwarerecs', 'health_meta', 'health', 'hermeneutics_meta', 'hermeneutics', 'hinduism_meta', 'hinduism', 'history_meta', 'history', 'homebrew_meta', 'homebrew', 'hsm_meta', 'hsm', 'interpersonal_meta', 'interpersonal', 'iot_meta', 'iot', 'iota_meta', 'iota', 'islam_meta', 'islam', 'italian_meta', 'italian', 'ja_meta_stackoverflow_com_7z', 'ja_stackoverflow_com_7z', 'japanese_meta', 'japanese', 'joomla_meta', 'joomla', 'judaism_meta', 'judaism', 'korean_meta', 'korean', 'languagelearning_meta', 'languagelearning', 'latin_meta', 'latin', 'law_meta', 'law', 'lifehacks_meta', 'lifehacks', 'linguistics_meta', 'linguistics', 'literature_meta', 'literature', 'magento_meta', 'magento', 'martialarts_meta', 'martialarts', 'materials_meta', 'materials', 'math_meta', 'math', 'matheducators_meta', 'matheducators', 'mathematica_meta', 'mathematica', 'mathoverflow_net_7z', 'mechanics_meta', 'mechanics', 'meta_askubuntu_com_7z', 'meta_mathoverflow_net_7z', 'meta_serverfault_com_7z', 'meta', 'meta_stackoverflow_com_7z', 'meta_superuser_com_7z', 'moderators_meta', 'moderators', 'monero_meta', 'monero', 'money_meta', 'money', 'movies_meta', 'movies', 'music_meta', 'music', 'musicfans_meta', 'musicfans', 'mythology_meta', 'mythology', 'networkengineering_meta', 'networkengineering', 'opendata_meta', 'opendata', 'opensource_meta', 'opensource', 'or_meta', 'or', 'outdoors_meta', 'outdoors', 'parenting_meta', 'parenting', 'patents_meta', 'patents', 'pets_meta', 'pets', 'philosophy_meta', 'philosophy', 'photo_meta', 'photo', 'physics_meta', 'physics', 'pm_meta', 'pm', 'poker_meta', 'poker', 'politics_meta', 'politics', 'portuguese_meta', 'portuguese', 'proofassistants_meta', 'proofassistants', 'pt_meta_stackoverflow_com_7z', 'pt_stackoverflow_com_7z', 'puzzling_meta', 'puzzling', 'quant_meta', 'quant', 'quantumcomputing_meta', 
'quantumcomputing', 'raspberrypi_meta', 'raspberrypi', 'retrocomputing_meta', 'retrocomputing', 'reverseengineering_meta', 'reverseengineering', 'robotics_meta', 'robotics', 'rpg_meta', 'rpg', 'ru_meta_stackoverflow_com_7z', 'ru_stackoverflow_com_7z', 'rus_meta', 'rus', 'russian_meta', 'russian', 'salesforce_meta', 'salesforce', 'scicomp_meta', 'scicomp', 'scifi_meta', 'scifi', 'security_meta', 'security', 'serverfault_com_7z', 'sharepoint_meta', 'sharepoint', 'sitecore_meta', 'sitecore', 'skeptics_meta', 'skeptics', 'softwareengineering_meta', 'softwareengineering', 'softwarerecs_meta', 'softwarerecs', 'solana_meta', 'solana', 'sound_meta', 'sound', 'space_meta', 'space', 'spanish_meta', 'spanish', 'sports_meta', 'sports', 'sqa_meta', 'sqa', 'stackapps_com_7z', 'stackoverflow_com_Badges_7z', 'stackoverflow_com_Comments_7z', 'stackoverflow_com_PostHistory_7z', 'stackoverflow_com_PostLinks_7z', 'stackoverflow_com_Tags_7z', 'stackoverflow_com_Users_7z', 'stackoverflow_com_Votes_7z', 'stats_meta', 'stats', 'stellar_meta', 'stellar', 'substrate_meta', 'substrate', 'superuser_com_7z', 'sustainability_meta', 'sustainability', 'tex_meta', 'tex', 'tezos_meta', 'tezos', 'tor_meta', 'tor', 'travel_meta', 'travel', 'tridion_meta', 'tridion', 'ukrainian_meta', 'ukrainian', 'unix_meta', 'unix', 'ux_meta', 'ux', 'vegetarianism_meta', 'vegetarianism', 'vi_meta', 'vi', 'webapps_meta', 'webapps', 'webmasters_meta', 'webmasters', 'windowsphone_meta', 'windowsphone', 'woodworking_meta', 'woodworking', 'wordpress_meta', 'wordpress', 'workplace_meta', 'workplace', 'worldbuilding_meta', 'worldbuilding', 'writers_meta', 'writers'])\n",
-      "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\n"
-     ]
-    }
    ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "xml_format_map = {\n",
-    "    \"Id\": int,\n",
-    "    \"PostTypeId\": int,\n",
-    "    \"CreationDate\": str,\n",
-    "    \"Score\": int,\n",
-    "    \"ViewCount\": int,\n",
-    "    \"Body\": str,\n",
-    "    \"AnswerCount\": int,\n",
-    "    \"CommentCount\": int,\n",
-    "    \"ContentLicense\": str,\n",
-    "    \"AcceptedAnswerId\": int,\n",
-    "    \"ParentId\": int,\n",
-    "}\n",
-    "\n",
-    "\n",
-    "# def extract_xml_file(file_url: str):\n",
-    "#   table = pd.read_xml(file_url)\n",
-    "#   return table\n",
-    "\n",
-    "\n",
-    "def xml_to_df(response: str):\n",
-    "    \"\"\"\n",
-    "    Collect and Manually import XML into Dataframe\n",
-    "\n",
-    "    pd.read_xml() errors when XML trees are too large, this is just a hack to\n",
-    "    download a XML file and parse into a Dataframe. **Not Tested on huge XML files**\n",
-    "\n",
-    "    Parameters:\n",
-    "    response (Requests.Response): Requests response object with the XML data\n",
-    "\n",
-    "    Returns:\n",
-    "    df (DataFrame): A Dataframe from the XML file\n",
-    "    \"\"\"\n",
-    "    soup = bs(response.content, \"xml\")\n",
-    "    posts = soup.find_all(\"row\")\n",
-    "\n",
-    "    all_posts = [post.attrs for post in posts]\n",
-    "\n",
-    "    df = pd.DataFrame(all_posts)\n",
-    "    df.AnswerCount.fillna(0, inplace=True)\n",
-    "    df.ViewCount.fillna(0, inplace=True)\n",
-    "    df.AcceptedAnswerId.fillna(0, inplace=True)\n",
-    "    df.ParentId.fillna(0, inplace=True)\n",
-    "    df[\"DataSource\"] = response.url\n",
-    "    df = df.astype(xml_format_map)\n",
-    "    return df\n",
-    "\n",
-    "\n",
-    "dataset_name = \"ai\"\n",
-    "\n",
-    "xml_posts_path = urls.get(dataset_name)\n",
-    "\n",
-    "\n",
-    "# df = extract_xml_file(test)\n",
-    "response = requests.get(xml_posts_path)\n",
-    "df = xml_to_df(response)\n",
-    "\n",
-    "\n",
-    "print(df.dtypes)\n",
-    "df.head()"
-   ],
+   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -165,11 +98,10 @@
     "id": "-t27RnxdzBYB",
     "outputId": "5ec0ceed-c82b-48fa-facd-41b4aae2f9e6"
    },
-   "execution_count": null,
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stdout",
+     "output_type": "stream",
      "text": [
       "Id                        int64\n",
       "PostTypeId                int64\n",
@@ -198,53 +130,7 @@
      ]
     },
     {
-     "output_type": "execute_result",
      "data": {
-      "text/plain": [
-       "   Id  PostTypeId  AcceptedAnswerId             CreationDate  Score  \\\n",
-       "0   1           1                 3  2016-08-02T15:39:14.947     10   \n",
-       "1   2           1                 9  2016-08-02T15:40:20.623     14   \n",
-       "2   3           2                 0  2016-08-02T15:40:24.820     15   \n",
-       "3   4           1                12  2016-08-02T15:41:22.020     33   \n",
-       "4   6           1                20  2016-08-02T15:43:35.460      7   \n",
-       "\n",
-       "   ViewCount                                               Body OwnerUserId  \\\n",
-       "0        710  <p>What does \"backprop\" mean? Is the \"backprop...           8   \n",
-       "1       1008  <p>Does increasing the noise in data help to i...           8   \n",
-       "2          0  <p>\"Backprop\" is the same as \"backpropagation\"...           4   \n",
-       "3       1266  <p>When you're writing your algorithm, how do ...           8   \n",
-       "4        279  <p>Given the following definition of an intell...          29   \n",
-       "\n",
-       "  LastEditorUserId             LastEditDate  ... AnswerCount CommentCount  \\\n",
-       "0             2444  2019-11-16T17:56:22.093  ...           5            0   \n",
-       "1             2444  2019-02-23T22:36:19.090  ...           3            0   \n",
-       "2              NaN                      NaN  ...           0            0   \n",
-       "3             2444  2021-01-19T23:54:07.813  ...           4            0   \n",
-       "4             2444  2019-06-15T18:25:58.513  ...           2            0   \n",
-       "\n",
-       "  ContentLicense  ParentId  ClosedDate FavoriteCount  CommunityOwnedDate  \\\n",
-       "0   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
-       "1   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
-       "2   CC BY-SA 3.0         1         NaN           NaN                 NaN   \n",
-       "3   CC BY-SA 3.0         0         NaN           NaN                 NaN   \n",
-       "4   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
-       "\n",
-       "  LastEditorDisplayName OwnerDisplayName  \\\n",
-       "0                   NaN              NaN   \n",
-       "1                   NaN              NaN   \n",
-       "2                   NaN              NaN   \n",
-       "3                   NaN              NaN   \n",
-       "4                   NaN              NaN   \n",
-       "\n",
-       "                                          DataSource  \n",
-       "0  https://ia600107.us.archive.org/view_archive.p...  \n",
-       "1  https://ia600107.us.archive.org/view_archive.p...  \n",
-       "2  https://ia600107.us.archive.org/view_archive.p...  \n",
-       "3  https://ia600107.us.archive.org/view_archive.p...  \n",
-       "4  https://ia600107.us.archive.org/view_archive.p...  \n",
-       "\n",
-       "[5 rows x 23 columns]"
-      ],
       "text/html": [
        "\n",
        "  <div id=\"df-776df830-b974-4f15-9190-53bcb8e84bf8\">\n",
@@ -491,99 +377,133 @@
        "    </div>\n",
        "  </div>\n",
        "  "
+      ],
+      "text/plain": [
+       "   Id  PostTypeId  AcceptedAnswerId             CreationDate  Score  \\\n",
+       "0   1           1                 3  2016-08-02T15:39:14.947     10   \n",
+       "1   2           1                 9  2016-08-02T15:40:20.623     14   \n",
+       "2   3           2                 0  2016-08-02T15:40:24.820     15   \n",
+       "3   4           1                12  2016-08-02T15:41:22.020     33   \n",
+       "4   6           1                20  2016-08-02T15:43:35.460      7   \n",
+       "\n",
+       "   ViewCount                                               Body OwnerUserId  \\\n",
+       "0        710  <p>What does \"backprop\" mean? Is the \"backprop...           8   \n",
+       "1       1008  <p>Does increasing the noise in data help to i...           8   \n",
+       "2          0  <p>\"Backprop\" is the same as \"backpropagation\"...           4   \n",
+       "3       1266  <p>When you're writing your algorithm, how do ...           8   \n",
+       "4        279  <p>Given the following definition of an intell...          29   \n",
+       "\n",
+       "  LastEditorUserId             LastEditDate  ... AnswerCount CommentCount  \\\n",
+       "0             2444  2019-11-16T17:56:22.093  ...           5            0   \n",
+       "1             2444  2019-02-23T22:36:19.090  ...           3            0   \n",
+       "2              NaN                      NaN  ...           0            0   \n",
+       "3             2444  2021-01-19T23:54:07.813  ...           4            0   \n",
+       "4             2444  2019-06-15T18:25:58.513  ...           2            0   \n",
+       "\n",
+       "  ContentLicense  ParentId  ClosedDate FavoriteCount  CommunityOwnedDate  \\\n",
+       "0   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
+       "1   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
+       "2   CC BY-SA 3.0         1         NaN           NaN                 NaN   \n",
+       "3   CC BY-SA 3.0         0         NaN           NaN                 NaN   \n",
+       "4   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
+       "\n",
+       "  LastEditorDisplayName OwnerDisplayName  \\\n",
+       "0                   NaN              NaN   \n",
+       "1                   NaN              NaN   \n",
+       "2                   NaN              NaN   \n",
+       "3                   NaN              NaN   \n",
+       "4                   NaN              NaN   \n",
+       "\n",
+       "                                          DataSource  \n",
+       "0  https://ia600107.us.archive.org/view_archive.p...  \n",
+       "1  https://ia600107.us.archive.org/view_archive.p...  \n",
+       "2  https://ia600107.us.archive.org/view_archive.p...  \n",
+       "3  https://ia600107.us.archive.org/view_archive.p...  \n",
+       "4  https://ia600107.us.archive.org/view_archive.p...  \n",
+       "\n",
+       "[5 rows x 23 columns]"
       ]
      },
+     "execution_count": 219,
      "metadata": {},
-     "execution_count": 219
+     "output_type": "execute_result"
     }
+   ],
+   "source": [
+    "xml_format_map = {\n",
+    "    \"Id\": int,\n",
+    "    \"PostTypeId\": int,\n",
+    "    \"CreationDate\": str,\n",
+    "    \"Score\": int,\n",
+    "    \"ViewCount\": int,\n",
+    "    \"Body\": str,\n",
+    "    \"AnswerCount\": int,\n",
+    "    \"CommentCount\": int,\n",
+    "    \"ContentLicense\": str,\n",
+    "    \"AcceptedAnswerId\": int,\n",
+    "    \"ParentId\": int,\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# def extract_xml_file(file_url: str):\n",
+    "#   table = pd.read_xml(file_url)\n",
+    "#   return table\n",
+    "\n",
+    "\n",
+    "def xml_to_df(response: requests.Response):\n",
+    "    \"\"\"\n",
+    "    Parse a StackExchange Posts.xml response into a typed Dataframe\n",
+    "\n",
+    "    pd.read_xml() errors when XML trees are too large, so the XML is parsed with\n",
+    "    BeautifulSoup instead and loaded manually. **Not tested on huge XML files**\n",
+    "\n",
+    "    Parameters:\n",
+    "    response (requests.Response): Requests response object with the XML data\n",
+    "\n",
+    "    Returns:\n",
+    "    df (DataFrame): One row per <row> element, cast with xml_format_map\n",
+    "    \"\"\"\n",
+    "    soup = bs(response.content, \"xml\")\n",
+    "    posts = soup.find_all(\"row\")\n",
+    "\n",
+    "    all_posts = [post.attrs for post in posts]\n",
+    "\n",
+    "    df = pd.DataFrame(all_posts)\n",
+    "    # Assign the filled columns back: chained `fillna(..., inplace=True)` acts on a\n",
+    "    # temporary under pandas Copy-on-Write, leaving NaNs that break the int casts.\n",
+    "    for column in (\"AnswerCount\", \"ViewCount\", \"AcceptedAnswerId\", \"ParentId\"):\n",
+    "        df[column] = df[column].fillna(0)\n",
+    "    df[\"DataSource\"] = response.url\n",
+    "    df = df.astype(xml_format_map)\n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "dataset_name = \"ai\"\n",
+    "\n",
+    "xml_posts_path = urls.get(dataset_name)\n",
+    "\n",
+    "\n",
+    "# df = extract_xml_file(test)\n",
+    "response = requests.get(xml_posts_path)\n",
+    "df = xml_to_df(response)\n",
+    "\n",
+    "\n",
+    "print(df.dtypes)\n",
+    "df.head()"
    ]
   },
   {
    "cell_type": "markdown",
-   "source": [
-    "# Transformations"
-   ],
    "metadata": {
     "id": "RAzTR7zY3oan"
-   }
+   },
+   "source": [
+    "# Transformations"
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "def filter_only_questions_with_accepted_answers(df):\n",
-    "    \"\"\"**TODO**\n",
-    "    Filter only to Questions with Accepted Answers\n",
-    "\n",
-    "    Filter dataframe by questions that have accepted answers, should also include\n",
-    "    all rows of answers for those questions, even if not accepted.\n",
-    "\n",
-    "    Parameters:\n",
-    "    df (DataFrame): containing a \"AcceptedAnswerId\", \"Id\", and \"ParentId\" columns\n",
-    "\n",
-    "    Returns:\n",
-    "    df (DataFrame): current dataframe with filtered results\n",
-    "    \"\"\"\n",
-    "    df = df[(df[\"AcceptedAnswerId\"].notnull()) | (df[\"ParentId\"] == df[\"Id\"])]\n",
-    "\n",
-    "\n",
-    "def filter_scores_above(df, question_score_threshold: int = 20, answer_score_threshold: int = 20):\n",
-    "    \"\"\"**TODO**\n",
-    "    Filter Dataframe by minimum scores\n",
-    "\n",
-    "    Filter Question and Answer columns by score thresholds to trim lower scoring results\n",
-    "\n",
-    "    Parameters:\n",
-    "    df (DataFrame): containing a \"Score\" column\n",
-    "\n",
-    "    Returns:\n",
-    "    df (DataFrame): current dataframe with filtered results\n",
-    "    \"\"\"\n",
-    "    df = df[\n",
-    "        ((df[\"Score\"] >= question_score_threshold) & (df.PostTypeId == 1))\n",
-    "        | ((df[\"Score\"] >= answer_score_threshold) & (df.PostTypeId == 2))\n",
-    "    ]\n",
-    "\n",
-    "\n",
-    "def convert_html_to_text(df, column: str = \"Body\"):\n",
-    "    \"\"\"\n",
-    "    Convert HTML tags to pure text\n",
-    "\n",
-    "    Feeds HTML text body into BeautifulSoup to parse it to only text. Set aside as\n",
-    "    function to provide option to skip\n",
-    "\n",
-    "    Parameters:\n",
-    "    df (DataFrame): containing a \"Body\" column with HTML\n",
-    "\n",
-    "    Returns:\n",
-    "    df (DataFrame): current dataframe with parsed column\n",
-    "    \"\"\"\n",
-    "    df.dropna(subset=[column], inplace=True)\n",
-    "    df[f\"{column}Clean\"] = df[column].apply(lambda row: bs(row, \"html.parser\").text)\n",
-    "\n",
-    "\n",
-    "def clean_tags(df):\n",
-    "    \"\"\"\n",
-    "    Convert Tags into Comma separated\n",
-    "\n",
-    "    Converts Tag slugs into commas separated tags\n",
-    "\n",
-    "    Parameters:\n",
-    "    df (DataFrame): containing a \"Tags\" column with slugs\n",
-    "\n",
-    "    Returns:\n",
-    "    df (DataFrame): current dataframe with parsed column\n",
-    "    \"\"\"\n",
-    "    df[\"TagsClean\"] = df[\"Tags\"].str.replace(\"-\", \" \").str.replace(\"><\", \", \").str.replace(\"<\", \"\").str.replace(\">\", \"\")\n",
-    "\n",
-    "\n",
-    "# filter_only_questions_with_accepted_answers(df)\n",
-    "# filter_scores_above(df)\n",
-    "convert_html_to_text(df)\n",
-    "clean_tags(df)\n",
-    "\n",
-    "df[[\"Body\", \"BodyClean\", \"Tags\", \"TagsClean\"]]\n",
-    "# print(df.shape)"
-   ],
+   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -592,66 +512,9 @@
     "id": "qyUqc31Z3Z9g",
     "outputId": "18dce8b4-af26-49c9-ee73-6c677177b516"
    },
-   "execution_count": null,
    "outputs": [
     {
-     "output_type": "execute_result",
      "data": {
-      "text/plain": [
-       "                                                    Body  \\\n",
-       "0      <p>What does \"backprop\" mean? Is the \"backprop...   \n",
-       "1      <p>Does increasing the noise in data help to i...   \n",
-       "2      <p>\"Backprop\" is the same as \"backpropagation\"...   \n",
-       "3      <p>When you're writing your algorithm, how do ...   \n",
-       "4      <p>Given the following definition of an intell...   \n",
-       "...                                                  ...   \n",
-       "23174  <p>The purpose of evaluating the state and act...   \n",
-       "23175  <p>In machine translation, convolution is a te...   \n",
-       "23176  <p>One of the key features of ChatGPT is its a...   \n",
-       "23177  <p>Given a neural network model for Covid-19 c...   \n",
-       "23178  <p>My question is more related to the fundamen...   \n",
-       "\n",
-       "                                               BodyClean  \\\n",
-       "0      What does \"backprop\" mean? Is the \"backprop\" t...   \n",
-       "1      Does increasing the noise in data help to impr...   \n",
-       "2      \"Backprop\" is the same as \"backpropagation\": i...   \n",
-       "3      When you're writing your algorithm, how do you...   \n",
-       "4      Given the following definition of an intellige...   \n",
-       "...                                                  ...   \n",
-       "23174  The purpose of evaluating the state and action...   \n",
-       "23175  In machine translation, convolution is a techn...   \n",
-       "23176  One of the key features of ChatGPT is its abil...   \n",
-       "23177  Given a neural network model for Covid-19 clas...   \n",
-       "23178  My question is more related to the fundamental...   \n",
-       "\n",
-       "                                                    Tags  \\\n",
-       "0      <neural-networks><backpropagation><terminology...   \n",
-       "1      <neural-networks><machine-learning><statistica...   \n",
-       "2                                                    NaN   \n",
-       "3      <neural-networks><hyperparameter-optimization>...   \n",
-       "4           <philosophy><definitions><intelligent-agent>   \n",
-       "...                                                  ...   \n",
-       "23174                                                NaN   \n",
-       "23175                                                NaN   \n",
-       "23176                                                NaN   \n",
-       "23177                        <neural-networks><homework>   \n",
-       "23178         <search><constraint-satisfaction-problems>   \n",
-       "\n",
-       "                                               TagsClean  \n",
-       "0      neural networks, backpropagation, terminology,...  \n",
-       "1      neural networks, machine learning, statistical...  \n",
-       "2                                                    NaN  \n",
-       "3      neural networks, hyperparameter optimization, ...  \n",
-       "4             philosophy, definitions, intelligent agent  \n",
-       "...                                                  ...  \n",
-       "23174                                                NaN  \n",
-       "23175                                                NaN  \n",
-       "23176                                                NaN  \n",
-       "23177                          neural networks, homework  \n",
-       "23178           search, constraint satisfaction problems  \n",
-       "\n",
-       "[23179 rows x 4 columns]"
-      ],
       "text/html": [
        "\n",
        "  <div id=\"df-c809ce1f-6807-4dfd-97c9-38d47afa28d7\">\n",
@@ -838,75 +701,156 @@
        "    </div>\n",
        "  </div>\n",
        "  "
+      ],
+      "text/plain": [
+       "                                                    Body  \\\n",
+       "0      <p>What does \"backprop\" mean? Is the \"backprop...   \n",
+       "1      <p>Does increasing the noise in data help to i...   \n",
+       "2      <p>\"Backprop\" is the same as \"backpropagation\"...   \n",
+       "3      <p>When you're writing your algorithm, how do ...   \n",
+       "4      <p>Given the following definition of an intell...   \n",
+       "...                                                  ...   \n",
+       "23174  <p>The purpose of evaluating the state and act...   \n",
+       "23175  <p>In machine translation, convolution is a te...   \n",
+       "23176  <p>One of the key features of ChatGPT is its a...   \n",
+       "23177  <p>Given a neural network model for Covid-19 c...   \n",
+       "23178  <p>My question is more related to the fundamen...   \n",
+       "\n",
+       "                                               BodyClean  \\\n",
+       "0      What does \"backprop\" mean? Is the \"backprop\" t...   \n",
+       "1      Does increasing the noise in data help to impr...   \n",
+       "2      \"Backprop\" is the same as \"backpropagation\": i...   \n",
+       "3      When you're writing your algorithm, how do you...   \n",
+       "4      Given the following definition of an intellige...   \n",
+       "...                                                  ...   \n",
+       "23174  The purpose of evaluating the state and action...   \n",
+       "23175  In machine translation, convolution is a techn...   \n",
+       "23176  One of the key features of ChatGPT is its abil...   \n",
+       "23177  Given a neural network model for Covid-19 clas...   \n",
+       "23178  My question is more related to the fundamental...   \n",
+       "\n",
+       "                                                    Tags  \\\n",
+       "0      <neural-networks><backpropagation><terminology...   \n",
+       "1      <neural-networks><machine-learning><statistica...   \n",
+       "2                                                    NaN   \n",
+       "3      <neural-networks><hyperparameter-optimization>...   \n",
+       "4           <philosophy><definitions><intelligent-agent>   \n",
+       "...                                                  ...   \n",
+       "23174                                                NaN   \n",
+       "23175                                                NaN   \n",
+       "23176                                                NaN   \n",
+       "23177                        <neural-networks><homework>   \n",
+       "23178         <search><constraint-satisfaction-problems>   \n",
+       "\n",
+       "                                               TagsClean  \n",
+       "0      neural networks, backpropagation, terminology,...  \n",
+       "1      neural networks, machine learning, statistical...  \n",
+       "2                                                    NaN  \n",
+       "3      neural networks, hyperparameter optimization, ...  \n",
+       "4             philosophy, definitions, intelligent agent  \n",
+       "...                                                  ...  \n",
+       "23174                                                NaN  \n",
+       "23175                                                NaN  \n",
+       "23176                                                NaN  \n",
+       "23177                          neural networks, homework  \n",
+       "23178           search, constraint satisfaction problems  \n",
+       "\n",
+       "[23179 rows x 4 columns]"
       ]
      },
+     "execution_count": 220,
      "metadata": {},
-     "execution_count": 220
+     "output_type": "execute_result"
     }
+   ],
+   "source": [
+    "def filter_only_questions_with_accepted_answers(df):\n",
+    "    \"\"\"Filter only to Questions with Accepted Answers\n",
+    "\n",
+    "    Keeps every question that has an accepted answer, plus all answers to those\n",
+    "    questions (accepted or not), matched through the answers' \"ParentId\".\n",
+    "\n",
+    "    Parameters:\n",
+    "    df (DataFrame): containing \"AcceptedAnswerId\", \"Id\", and \"ParentId\" columns\n",
+    "\n",
+    "    Returns:\n",
+    "    DataFrame: the matching rows; df itself is not modified, so assign the result\n",
+    "    \"\"\"\n",
+    "    has_accepted = df[\"AcceptedAnswerId\"].notnull()\n",
+    "    return df[has_accepted | df[\"ParentId\"].isin(df.loc[has_accepted, \"Id\"])]\n",
+    "\n",
+    "\n",
+    "def filter_scores_above(df, question_score_threshold: int = 20, answer_score_threshold: int = 20):\n",
+    "    \"\"\"\n",
+    "    Filter Dataframe by minimum scores\n",
+    "\n",
+    "    Parameters:\n",
+    "    df (DataFrame): containing \"Score\" and \"PostTypeId\" columns\n",
+    "    question_score_threshold (int): minimum Score kept for questions (PostTypeId 1)\n",
+    "    answer_score_threshold (int): minimum Score kept for answers (PostTypeId 2)\n",
+    "\n",
+    "    Returns:\n",
+    "    DataFrame: the rows that pass; df itself is not modified, so assign the result\n",
+    "    \"\"\"\n",
+    "    return df[\n",
+    "        ((df[\"Score\"] >= question_score_threshold) & (df.PostTypeId == 1))\n",
+    "        | ((df[\"Score\"] >= answer_score_threshold) & (df.PostTypeId == 2))\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "def convert_html_to_text(df, column: str = \"Body\"):\n",
+    "    \"\"\"\n",
+    "    Convert HTML tags to pure text\n",
+    "\n",
+    "    Drops rows where `column` is missing, then feeds each HTML value into BeautifulSoup\n",
+    "    and stores the parsed text in \"<column>Clean\". Set aside as function to provide option to skip\n",
+    "\n",
+    "    Parameters:\n",
+    "    df (DataFrame): containing the HTML column named by `column` (default \"Body\"); modified in place\n",
+    "\n",
+    "    Returns:\n",
+    "    None: nothing is returned, the new column is added to df itself\n",
+    "    \"\"\"\n",
+    "    df.dropna(subset=[column], inplace=True)\n",
+    "    df[f\"{column}Clean\"] = df[column].apply(lambda row: bs(row, \"html.parser\").text)\n",
+    "\n",
+    "\n",
+    "def clean_tags(df):\n",
+    "    \"\"\"\n",
+    "    Convert Tags into a comma-separated string\n",
+    "\n",
+    "    Turns slugs such as \"<neural-networks><homework>\" into \"neural networks, homework\"\n",
+    "\n",
+    "    Parameters:\n",
+    "    df (DataFrame): containing a \"Tags\" column with slugs; modified in place\n",
+    "\n",
+    "    Returns:\n",
+    "    None: nothing is returned, the \"TagsClean\" column is added to df itself\n",
+    "    \"\"\"\n",
+    "    df[\"TagsClean\"] = df[\"Tags\"].str.replace(\"-\", \" \").str.replace(\"><\", \", \").str.replace(\"<\", \"\").str.replace(\">\", \"\")\n",
+    "\n",
+    "\n",
+    "# filter_only_questions_with_accepted_answers(df)\n",
+    "# filter_scores_above(df)\n",
+    "convert_html_to_text(df)  # adds \"BodyClean\" to df in place\n",
+    "clean_tags(df)  # adds \"TagsClean\" to df in place\n",
+    "\n",
+    "df[[\"Body\", \"BodyClean\", \"Tags\", \"TagsClean\"]]  # raw vs. cleaned columns side by side\n",
+    "# print(df.shape)"
    ]
   },
   {
    "cell_type": "markdown",
-   "source": [
-    "This groups questions with answers so that a row with a question also has a column with an answer. It then creates an AcceptedAnswerFlag column that is True if the answer was accepted by the person who asked the question. Changing the `number_of_results` variable will limit the number of answers you want to keep."
-   ],
    "metadata": {
     "id": "C09Bwdw-44PZ"
-   }
+   },
+   "source": [
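+    "The next cell left-joins each question to its answers (question `Id` = answer `ParentId`), so that every row pairs a question with one of its answers; a question with no answers is kept as a single row with empty answer columns. It then creates an `AcceptedAnswerFlag` column that is True if the answer was accepted by the person who asked the question (its Id matches the question's `AcceptedAnswerId`). Lowering the `number_of_results` variable limits how many answers are kept per question, with the accepted answer first and the rest ordered by score."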
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "questions = df[df.PostTypeId == 1]\n",
-    "answers = df[df.PostTypeId == 2]\n",
-    "\n",
-    "df = pd.merge(\n",
-    "    questions,\n",
-    "    answers[\n",
-    "        [\n",
-    "            \"Id\",\n",
-    "            \"CreationDate\",\n",
-    "            \"Score\",\n",
-    "            \"ViewCount\",\n",
-    "            \"CommentCount\",\n",
-    "            \"ContentLicense\",\n",
-    "            \"TagsClean\",\n",
-    "            \"BodyClean\",\n",
-    "            \"ParentId\",\n",
-    "        ]\n",
-    "    ],\n",
-    "    left_on=\"Id\",\n",
-    "    right_on=\"ParentId\",\n",
-    "    suffixes=(\"_q\", \"_a\"),\n",
-    "    how=\"left\",\n",
-    ")\n",
-    "\n",
-    "df[\"AcceptedAnswerFlag\"] = df.apply(lambda row: row[\"Id_a\"] == row[\"AcceptedAnswerId\"], axis=1)\n",
-    "\n",
-    "df = df.rename(\n",
-    "    columns={\n",
-    "        \"BodyClean_q\": \"Question\",\n",
-    "        \"Score_q\": \"QuestionScore\",\n",
-    "        \"TagsClean_q\": \"QuestionTags\",\n",
-    "        \"BodyClean_a\": \"Answer\",\n",
-    "        \"Score_a\": \"AnswerScore\",\n",
-    "        \"ContentLicense_q\": \"QuestionContentLicense\",\n",
-    "        \"ContentLicense_a\": \"AnswerContentLicense\",\n",
-    "        \"CreationDate_q\": \"CreationDate\",\n",
-    "    }\n",
-    ")\n",
-    "\n",
-    "## Set the number of results to a lower number to only return top N rated Answers.\n",
-    "number_of_results = 25\n",
-    "df = (\n",
-    "    df.sort_values(by=[\"AcceptedAnswerFlag\", \"AnswerScore\"], ascending=[False, False])\n",
-    "    .groupby(\"Question\")\n",
-    "    .head(number_of_results)\n",
-    "    .reset_index(drop=True)\n",
-    ")\n",
-    "\n",
-    "df[[\"Id_q\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].head()"
-   ],
+   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -915,40 +859,9 @@
     "id": "Bgz2fZ9k43Ab",
     "outputId": "28896d69-03cd-4877-fdfb-ae48dafa4ff3"
    },
-   "execution_count": null,
    "outputs": [
     {
-     "output_type": "execute_result",
      "data": {
-      "text/plain": [
-       "    Id_q                                           Question  QuestionScore  \\\n",
-       "0   1768  In Portal 2 we see that AI's can be \"killed\" b...            175   \n",
-       "1  10623  What is self-supervised learning in machine le...             91   \n",
-       "2    111  Obviously, self-driving cars aren't perfect, s...            100   \n",
-       "3  14224  If the original purpose for developing AI was ...             69   \n",
-       "4   1479  Do scientists or research experts know from th...             94   \n",
-       "\n",
-       "                                        QuestionTags     Id_a  \\\n",
-       "0  philosophy, decision theory, mythology of ai, ...   1769.0   \n",
-       "1  machine learning, comparison, supervised learn...  10624.0   \n",
-       "2  philosophy, ethics, autonomous vehicles, decis...   1790.0   \n",
-       "3                 philosophy, social, explainable ai  14247.0   \n",
-       "4  neural networks, deep learning, convolutional ...   4044.0   \n",
-       "\n",
-       "                                              Answer  AnswerScore  \\\n",
-       "0  This classic problem exhibits a basic misunder...        146.0   \n",
-       "1  Introduction\\nThe term self-supervised learnin...         90.0   \n",
-       "2  \\nHow could self-driving cars make ethical dec...         76.0   \n",
-       "3  As argued by Selvaraju et al., there are three...         75.0   \n",
-       "4  There are many approaches that aim to make a t...         69.0   \n",
-       "\n",
-       "   AcceptedAnswerFlag  \n",
-       "0                True  \n",
-       "1                True  \n",
-       "2                True  \n",
-       "3                True  \n",
-       "4                True  "
-      ],
       "text/html": [
        "\n",
        "  <div id=\"df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5\">\n",
@@ -1116,22 +1029,97 @@
        "    </div>\n",
        "  </div>\n",
        "  "
+      ],
+      "text/plain": [
+       "    Id_q                                           Question  QuestionScore  \\\n",
+       "0   1768  In Portal 2 we see that AI's can be \"killed\" b...            175   \n",
+       "1  10623  What is self-supervised learning in machine le...             91   \n",
+       "2    111  Obviously, self-driving cars aren't perfect, s...            100   \n",
+       "3  14224  If the original purpose for developing AI was ...             69   \n",
+       "4   1479  Do scientists or research experts know from th...             94   \n",
+       "\n",
+       "                                        QuestionTags     Id_a  \\\n",
+       "0  philosophy, decision theory, mythology of ai, ...   1769.0   \n",
+       "1  machine learning, comparison, supervised learn...  10624.0   \n",
+       "2  philosophy, ethics, autonomous vehicles, decis...   1790.0   \n",
+       "3                 philosophy, social, explainable ai  14247.0   \n",
+       "4  neural networks, deep learning, convolutional ...   4044.0   \n",
+       "\n",
+       "                                              Answer  AnswerScore  \\\n",
+       "0  This classic problem exhibits a basic misunder...        146.0   \n",
+       "1  Introduction\\nThe term self-supervised learnin...         90.0   \n",
+       "2  \\nHow could self-driving cars make ethical dec...         76.0   \n",
+       "3  As argued by Selvaraju et al., there are three...         75.0   \n",
+       "4  There are many approaches that aim to make a t...         69.0   \n",
+       "\n",
+       "   AcceptedAnswerFlag  \n",
+       "0                True  \n",
+       "1                True  \n",
+       "2                True  \n",
+       "3                True  \n",
+       "4                True  "
       ]
      },
+     "execution_count": 221,
      "metadata": {},
-     "execution_count": 221
+     "output_type": "execute_result"
     }
+   ],
+   "source": [
+    "questions = df[df.PostTypeId == 1]\n",
+    "answers = df[df.PostTypeId == 2]\n",
+    "\n",
+    "df = pd.merge(\n",
+    "    questions,\n",
+    "    answers[\n",
+    "        [\n",
+    "            \"Id\",\n",
+    "            \"CreationDate\",\n",
+    "            \"Score\",\n",
+    "            \"ViewCount\",\n",
+    "            \"CommentCount\",\n",
+    "            \"ContentLicense\",\n",
+    "            \"TagsClean\",\n",
+    "            \"BodyClean\",\n",
+    "            \"ParentId\",\n",
+    "        ]\n",
+    "    ],\n",
+    "    left_on=\"Id\",\n",
+    "    right_on=\"ParentId\",\n",
+    "    suffixes=(\"_q\", \"_a\"),\n",
+    "    how=\"left\",  # keep questions that have no answers\n",
+    ")\n",
+    "\n",
+    "df[\"AcceptedAnswerFlag\"] = df[\"Id_a\"] == df[\"AcceptedAnswerId\"]  # vectorized; a missing Id_a never matches\n",
+    "\n",
+    "df = df.rename(\n",
+    "    columns={\n",
+    "        \"BodyClean_q\": \"Question\",\n",
+    "        \"Score_q\": \"QuestionScore\",\n",
+    "        \"TagsClean_q\": \"QuestionTags\",\n",
+    "        \"BodyClean_a\": \"Answer\",\n",
+    "        \"Score_a\": \"AnswerScore\",\n",
+    "        \"ContentLicense_q\": \"QuestionContentLicense\",\n",
+    "        \"ContentLicense_a\": \"AnswerContentLicense\",\n",
+    "        \"CreationDate_q\": \"CreationDate\",\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "## Set the number of results to a lower number to only return top N rated Answers.\n",
+    "number_of_results = 25\n",
+    "df = (\n",
+    "    df.sort_values(by=[\"AcceptedAnswerFlag\", \"AnswerScore\"], ascending=[False, False])  # accepted first, then by score\n",
+    "    .groupby(\"Question\")\n",
+    "    .head(number_of_results)\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "\n",
+    "df[[\"Id_q\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].head()"
    ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "testing_id = df.Id_q.mode()[0]\n",
-    "df[(df.Id_q == testing_id) | (df.ParentId_a == testing_id)][\n",
-    "    [\"Id_q\", \"Question\", \"ParentId_a\", \"AcceptedAnswerId\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]\n",
-    "]\n",
-    "# df[['Id_q', 'Question', 'ParentId_a', 'AcceptedAnswerId', 'Id_a', 'Answer', 'AnswerScore', 'AcceptedAnswerFlag']]"
-   ],
+   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -1140,96 +1128,9 @@
     "id": "eds1K8WL9QPo",
     "outputId": "bc526503-d6dd-428f-fa98-ad419d26a7dc"
    },
-   "execution_count": null,
    "outputs": [
     {
-     "output_type": "execute_result",
      "data": {
-      "text/plain": [
-       "       Id_q                                           Question  ParentId_a  \\\n",
-       "7     15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "3662  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "3713  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "3788  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "3821  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "3882  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "4389  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "4849  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "4850  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "5763  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "5764  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "5765  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "7462  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "7463  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "7464  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "7465  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "7466  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "7467  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "9481  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
-       "\n",
-       "      AcceptedAnswerId     Id_a  \\\n",
-       "7                15744  15744.0   \n",
-       "3662             15744  15753.0   \n",
-       "3713             15744  15747.0   \n",
-       "3788             15744  15756.0   \n",
-       "3821             15744  15758.0   \n",
-       "3882             15744  15762.0   \n",
-       "4389             15744  15783.0   \n",
-       "4849             15744  15740.0   \n",
-       "4850             15744  15803.0   \n",
-       "5763             15744  15768.0   \n",
-       "5764             15744  15810.0   \n",
-       "5765             15744  15943.0   \n",
-       "7462             15744  15779.0   \n",
-       "7463             15744  15787.0   \n",
-       "7464             15744  15801.0   \n",
-       "7465             15744  15930.0   \n",
-       "7466             15744  15934.0   \n",
-       "7467             15744  15938.0   \n",
-       "9481             15744  15931.0   \n",
-       "\n",
-       "                                                 Answer  AnswerScore  \\\n",
-       "7     I think this is a fairly common misconception ...         62.0   \n",
-       "3662  I think your premise is flawed.\\nYou seem to a...         19.0   \n",
-       "3713  TL;DR: The subtleties of infinity are made app...         12.0   \n",
-       "3788  In Haskell, you can type:\\nprint [1..]\\nand it...          9.0   \n",
-       "3821  I believe humans can be said to understand inf...          8.0   \n",
-       "3882  (There's a summary at the bottom for those who...          7.0   \n",
-       "4389  Then premise assumes that humans \"understand\" ...          4.0   \n",
-       "4849  By adding some rules for infinity in arithmeti...          3.0   \n",
-       "4850  I think the concept that is missing in the dis...          3.0   \n",
-       "5763  Computers don't understand \"infinity\" or even ...          2.0   \n",
-       "5764  The Questions That Computers Can Never Answer ...          2.0   \n",
-       "5765  John Doucette's answer covers my thoughts on t...          2.0   \n",
-       "7462  I would think that a computer couldn’t underst...          1.0   \n",
-       "7463  The \"concept\" of infinity is 1 thing to unders...          1.0   \n",
-       "7464  Just food for thought: how about if we try to ...          1.0   \n",
-       "7465  Its arguable if we humans understand infinity....          1.0   \n",
-       "7466  Well -- just to touch on the question of peopl...          1.0   \n",
-       "7467  Humans certainly don't understand infinity. Cu...          1.0   \n",
-       "9481  I think the property humans have which compute...          0.0   \n",
-       "\n",
-       "      AcceptedAnswerFlag  \n",
-       "7                   True  \n",
-       "3662               False  \n",
-       "3713               False  \n",
-       "3788               False  \n",
-       "3821               False  \n",
-       "3882               False  \n",
-       "4389               False  \n",
-       "4849               False  \n",
-       "4850               False  \n",
-       "5763               False  \n",
-       "5764               False  \n",
-       "5765               False  \n",
-       "7462               False  \n",
-       "7463               False  \n",
-       "7464               False  \n",
-       "7465               False  \n",
-       "7466               False  \n",
-       "7467               False  \n",
-       "9481               False  "
-      ],
       "text/html": [
        "\n",
        "  <div id=\"df-16d171db-e359-46f3-a969-510a35cee78f\">\n",
@@ -1551,43 +1452,119 @@
        "    </div>\n",
        "  </div>\n",
        "  "
+      ],
+      "text/plain": [
+       "       Id_q                                           Question  ParentId_a  \\\n",
+       "7     15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "3662  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "3713  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "3788  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "3821  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "3882  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "4389  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "4849  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "4850  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "5763  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "5764  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "5765  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "7462  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "7463  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "7464  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "7465  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "7466  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "7467  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "9481  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
+       "\n",
+       "      AcceptedAnswerId     Id_a  \\\n",
+       "7                15744  15744.0   \n",
+       "3662             15744  15753.0   \n",
+       "3713             15744  15747.0   \n",
+       "3788             15744  15756.0   \n",
+       "3821             15744  15758.0   \n",
+       "3882             15744  15762.0   \n",
+       "4389             15744  15783.0   \n",
+       "4849             15744  15740.0   \n",
+       "4850             15744  15803.0   \n",
+       "5763             15744  15768.0   \n",
+       "5764             15744  15810.0   \n",
+       "5765             15744  15943.0   \n",
+       "7462             15744  15779.0   \n",
+       "7463             15744  15787.0   \n",
+       "7464             15744  15801.0   \n",
+       "7465             15744  15930.0   \n",
+       "7466             15744  15934.0   \n",
+       "7467             15744  15938.0   \n",
+       "9481             15744  15931.0   \n",
+       "\n",
+       "                                                 Answer  AnswerScore  \\\n",
+       "7     I think this is a fairly common misconception ...         62.0   \n",
+       "3662  I think your premise is flawed.\\nYou seem to a...         19.0   \n",
+       "3713  TL;DR: The subtleties of infinity are made app...         12.0   \n",
+       "3788  In Haskell, you can type:\\nprint [1..]\\nand it...          9.0   \n",
+       "3821  I believe humans can be said to understand inf...          8.0   \n",
+       "3882  (There's a summary at the bottom for those who...          7.0   \n",
+       "4389  Then premise assumes that humans \"understand\" ...          4.0   \n",
+       "4849  By adding some rules for infinity in arithmeti...          3.0   \n",
+       "4850  I think the concept that is missing in the dis...          3.0   \n",
+       "5763  Computers don't understand \"infinity\" or even ...          2.0   \n",
+       "5764  The Questions That Computers Can Never Answer ...          2.0   \n",
+       "5765  John Doucette's answer covers my thoughts on t...          2.0   \n",
+       "7462  I would think that a computer couldn’t underst...          1.0   \n",
+       "7463  The \"concept\" of infinity is 1 thing to unders...          1.0   \n",
+       "7464  Just food for thought: how about if we try to ...          1.0   \n",
+       "7465  Its arguable if we humans understand infinity....          1.0   \n",
+       "7466  Well -- just to touch on the question of peopl...          1.0   \n",
+       "7467  Humans certainly don't understand infinity. Cu...          1.0   \n",
+       "9481  I think the property humans have which compute...          0.0   \n",
+       "\n",
+       "      AcceptedAnswerFlag  \n",
+       "7                   True  \n",
+       "3662               False  \n",
+       "3713               False  \n",
+       "3788               False  \n",
+       "3821               False  \n",
+       "3882               False  \n",
+       "4389               False  \n",
+       "4849               False  \n",
+       "4850               False  \n",
+       "5763               False  \n",
+       "5764               False  \n",
+       "5765               False  \n",
+       "7462               False  \n",
+       "7463               False  \n",
+       "7464               False  \n",
+       "7465               False  \n",
+       "7466               False  \n",
+       "7467               False  \n",
+       "9481               False  "
       ]
      },
+     "execution_count": 222,
      "metadata": {},
-     "execution_count": 222
+     "output_type": "execute_result"
     }
+   ],
+   "source": [
+    "testing_id = df.Id_q.mode()[0]  # most frequent Id_q, i.e. the question with the most rows in df\n",
+    "df[(df.Id_q == testing_id) | (df.ParentId_a == testing_id)][\n",
+    "    [\"Id_q\", \"Question\", \"ParentId_a\", \"AcceptedAnswerId\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]\n",
+    "]\n",
+    "# df[['Id_q', 'Question', 'ParentId_a', 'AcceptedAnswerId', 'Id_a', 'Answer', 'AnswerScore', 'AcceptedAnswerFlag']]"
    ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "id": "gXgpXEO7DCbj"
+   },
    "source": [
     "# Create JSONL version of Dataframe\n",
     "This groups the dataframe by question data and creates nested list of Answers for that group. The entire list contains individual JSON objects, each representing a single question in the dataset with a key, Answers, which contains a list of dictionaries for each answer to the question."
-   ],
-   "metadata": {
-    "id": "gXgpXEO7DCbj"
-   }
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "j = (\n",
-    "    df.groupby(\n",
-    "        [\"Title\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"QuestionContentLicense\", \"DataSource\", \"CreationDate\"]\n",
-    "    )\n",
-    "    .apply(lambda x: x[[\"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].to_dict(\"records\"))\n",
-    "    .reset_index()\n",
-    "    .rename(columns={0: \"Answers\"})\n",
-    "    .to_json(orient=\"records\")\n",
-    ")\n",
-    "\n",
-    "data = json.loads(j)\n",
-    "\n",
-    "for post in data:\n",
-    "    if len(post.get(\"Answers\")) >= 4:\n",
-    "        print(json.dumps(post, indent=4))\n",
-    "        break"
-   ],
+   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
@@ -1595,11 +1572,10 @@
     "id": "OBR58MSRzAMP",
     "outputId": "c7da1e6c-3a97-465d-c9ba-7e055cb0d751"
    },
-   "execution_count": null,
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stdout",
+     "output_type": "stream",
      "text": [
       "{\n",
       "    \"Title\": \"1 hidden layer with 1000 neurons vs. 10 hidden layers with 100 neurons\",\n",
@@ -1634,21 +1610,56 @@
       "}\n"
      ]
     }
+   ],
+   "source": [
+    "j = (\n",
+    "    df.groupby(\n",
+    "        [\"Title\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"QuestionContentLicense\", \"DataSource\", \"CreationDate\"]\n",
+    "    )\n",
+    "    .apply(lambda x: x[[\"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].to_dict(\"records\"))\n",
+    "    .reset_index()\n",
+    "    .rename(columns={0: \"Answers\"})\n",
+    "    .to_json(orient=\"records\")\n",
+    ")\n",
+    "\n",
+    "data = json.loads(j)\n",
+    "\n",
+    "for post in data:\n",
+    "    if len(post.get(\"Answers\")) >= 4:\n",
+    "        print(json.dumps(post, indent=4))\n",
+    "        break"
    ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "id": "PlNjrpXaDm1_"
+   },
    "source": [
     "# Save file\n",
     "\n",
     "Files can be saved as JSON, JSONL, CSV, or Parquet"
-   ],
-   "metadata": {
-    "id": "PlNjrpXaDm1_"
-   }
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "CU0gWRGQDqIs",
+    "outputId": "9646e475-cedd-46f1-f9b8-7eb1fbc703c7"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\n"
+     ]
+    }
+   ],
    "source": [
     "file_name = dataset_name\n",
     "\n",
@@ -1685,40 +1696,28 @@
     "\n",
     "# save_data(data=data, file_name=file_name, file_type='jsonl')\n",
     "# save_data(data=df, file_name=file_name, file_type='parquet')"
-   ],
-   "metadata": {
-    "id": "CU0gWRGQDqIs",
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "outputId": "9646e475-cedd-46f1-f9b8-7eb1fbc703c7"
-   },
-   "execution_count": null,
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\n"
-     ]
-    }
    ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "id": "BdN3hKxtgH7f"
+   },
    "source": [
     "# Open-Assistant Data Scheme\n",
     "\n",
     "Testing putting the data into the Open-Assistant Data Scheme\n",
     "\n",
     "https://github.com/LAION-AI/Open-Assistant/blob/main/docs/data_schemas.md"
-   ],
-   "metadata": {
-    "id": "BdN3hKxtgH7f"
-   }
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "n8ubYQxegNSY"
+   },
+   "outputs": [],
    "source": [
     "from typing import TypeVar, List, Dict, Any, Literal\n",
     "from json import JSONEncoder\n",
@@ -1754,15 +1753,53 @@
     "class TreeEncoder(JSONEncoder):\n",
     "    def default(self, o):\n",
     "        return o.__dict__"
-   ],
-   "metadata": {
-    "id": "n8ubYQxegNSY"
-   },
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "eE0fkytExSGl",
+    "outputId": "594632d6-f98c-49b8-af86-25f7f5e2ce06"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\n",
+      "    \"root\": {\n",
+      "        \"text\": \"Science Fiction has frequently shown AI to be a threat to the very existence of mankind. AI systems have often been the antagonists in many works of fiction, from 2001: A Space Odyssey through to The Terminator and beyond.\\nThe Media seems to buy into this trope as well.  And in recent years we have had people like Elon Musk warn us of the dangers of an impending AI revolution, stating that AI is more dangerous than nukes.\\nAnd, apparently, experts think that we will be seeing this AI revolution in the next 100 years.\\nHowever, from my (albeit limited) study of AI, I get the impression that they are all wrong. I am going to outline my understanding below, please correct me if I am wrong:\\n\\nFirstly, all of these things seem to be confusing Artificial Intelligence with Artificial Consciousness.  AI is essentially a system to make intelligent decisions, whereas AC is more like the \\\"self-aware\\\" systems that are shown in science fiction.\\n\\nNot AI itself, but intelligence and intelligent decision-making algorithms are something we've been working with and enhancing since before computers have been around.  Moving this over to an artificial framework is fairly easy.  However, consciousness is still something we are learning about.  
My guess is we won't be able to re-create something artificially if we barely understand how it works in the real world.\\n\\nSo, my conclusion is that no AI system will be able to learn enough to start thinking for itself, and that all our warnings of AI are completely unjustified.\\n\\nThe real danger comes from AC, which we are a long, long way from realizing because we are still a long way off from defining exactly what consciousness is, let alone understanding it.\\n\\n\\n\\nSo, my question is, assuming that my understanding is correct, are any efforts are being made by companies or organizations that work with AI to correct these popular misunderstandings in sci-fi, the media, and/or the public?\\nOr are the proponents of AI ambivalent towards this public fear-mongering?\\nI understand that the fear mongering is going to remain popular for some time, as bad news sells better than good news. I am just wondering if the general attitude from AI organizations is to ignore this popular misconception, or whether a concerted effort is being made to fight against these AI myths (but unfortunately nobody in the media is listening or cares).\\n\",\n",
+      "        \"role\": \"prompter\",\n",
+      "        \"children\": [\n",
+      "            {\n",
+      "                \"text\": \"Nothing.  \\nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators makes a perception that the field has greater capabilities than it does $\\\\rightarrow$ this leads to grants, funding, etc...  \\nIs there any negative? Yes. Misconceptions always have drawbacks. We see the creation of dumb ethics boards and such cough cough Elon Musk.\\nBut if history has anything to say about this, as the field gains popularity (which it is dnagerously quick), information will spread by definition, and eventually misconceptions will be laid to rest.\\nNote that this answer is biased and based upon my own opinions\\n\",\n",
+      "                \"role\": \"assistant\",\n",
+      "                \"children\": [],\n",
+      "                \"metadata\": {\n",
+      "                    \"AnswerScore\": 2.0,\n",
+      "                    \"AcceptedAnswerFlag\": true\n",
+      "                }\n",
+      "            }\n",
+      "        ],\n",
+      "        \"metadata\": {\n",
+      "            \"QuestionScore\": 5,\n",
+      "            \"QuestionTags\": \"social, artificial consciousness\"\n",
+      "        }\n",
+      "    },\n",
+      "    \"metadata\": {\n",
+      "        \"Title\": \"\\\"AI will kill us all! The machines will rise up!\\\" - what is being done to dispel such myths?\",\n",
+      "        \"QuestionContentLicense\": \"CC BY-SA 4.0\",\n",
+      "        \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n",
+      "        \"CreationDate\": \"2019-10-16T13:57:37.143\"\n",
+      "    }\n",
+      "}\n"
+     ]
+    }
+   ],
    "source": [
     "conversation_forest = []\n",
     "\n",
@@ -1796,50 +1833,21 @@
     "\n",
     "\n",
     "print(json.dumps(conversation_forest_json, indent=4), file=open(f\"/content/{file_name}.json\", \"w\"))"
-   ],
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "eE0fkytExSGl",
-    "outputId": "594632d6-f98c-49b8-af86-25f7f5e2ce06"
-   },
-   "execution_count": null,
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "{\n",
-      "    \"root\": {\n",
-      "        \"text\": \"Science Fiction has frequently shown AI to be a threat to the very existence of mankind. AI systems have often been the antagonists in many works of fiction, from 2001: A Space Odyssey through to The Terminator and beyond.\\nThe Media seems to buy into this trope as well.  And in recent years we have had people like Elon Musk warn us of the dangers of an impending AI revolution, stating that AI is more dangerous than nukes.\\nAnd, apparently, experts think that we will be seeing this AI revolution in the next 100 years.\\nHowever, from my (albeit limited) study of AI, I get the impression that they are all wrong. I am going to outline my understanding below, please correct me if I am wrong:\\n\\nFirstly, all of these things seem to be confusing Artificial Intelligence with Artificial Consciousness.  AI is essentially a system to make intelligent decisions, whereas AC is more like the \\\"self-aware\\\" systems that are shown in science fiction.\\n\\nNot AI itself, but intelligence and intelligent decision-making algorithms are something we've been working with and enhancing since before computers have been around.  Moving this over to an artificial framework is fairly easy.  However, consciousness is still something we are learning about.  
My guess is we won't be able to re-create something artificially if we barely understand how it works in the real world.\\n\\nSo, my conclusion is that no AI system will be able to learn enough to start thinking for itself, and that all our warnings of AI are completely unjustified.\\n\\nThe real danger comes from AC, which we are a long, long way from realizing because we are still a long way off from defining exactly what consciousness is, let alone understanding it.\\n\\n\\n\\nSo, my question is, assuming that my understanding is correct, are any efforts are being made by companies or organizations that work with AI to correct these popular misunderstandings in sci-fi, the media, and/or the public?\\nOr are the proponents of AI ambivalent towards this public fear-mongering?\\nI understand that the fear mongering is going to remain popular for some time, as bad news sells better than good news. I am just wondering if the general attitude from AI organizations is to ignore this popular misconception, or whether a concerted effort is being made to fight against these AI myths (but unfortunately nobody in the media is listening or cares).\\n\",\n",
-      "        \"role\": \"prompter\",\n",
-      "        \"children\": [\n",
-      "            {\n",
-      "                \"text\": \"Nothing.  \\nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators makes a perception that the field has greater capabilities than it does $\\\\rightarrow$ this leads to grants, funding, etc...  \\nIs there any negative? Yes. Misconceptions always have drawbacks. We see the creation of dumb ethics boards and such cough cough Elon Musk.\\nBut if history has anything to say about this, as the field gains popularity (which it is dnagerously quick), information will spread by definition, and eventually misconceptions will be laid to rest.\\nNote that this answer is biased and based upon my own opinions\\n\",\n",
-      "                \"role\": \"assistant\",\n",
-      "                \"children\": [],\n",
-      "                \"metadata\": {\n",
-      "                    \"AnswerScore\": 2.0,\n",
-      "                    \"AcceptedAnswerFlag\": true\n",
-      "                }\n",
-      "            }\n",
-      "        ],\n",
-      "        \"metadata\": {\n",
-      "            \"QuestionScore\": 5,\n",
-      "            \"QuestionTags\": \"social, artificial consciousness\"\n",
-      "        }\n",
-      "    },\n",
-      "    \"metadata\": {\n",
-      "        \"Title\": \"\\\"AI will kill us all! The machines will rise up!\\\" - what is being done to dispel such myths?\",\n",
-      "        \"QuestionContentLicense\": \"CC BY-SA 4.0\",\n",
-      "        \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n",
-      "        \"CreationDate\": \"2019-10-16T13:57:37.143\"\n",
-      "    }\n",
-      "}\n"
-     ]
-    }
    ]
   }
- ]
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb b/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb
index 907c4365..5ec0f019 100644
--- a/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb
+++ b/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb
@@ -1,5 +1,23 @@
 {
  "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# uncomment below to install required python packages\n",
+    "#!pip install detoxify"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",