diff --git a/notebooks/code-bugger/openbugger_example.ipynb b/notebooks/code-bugger/openbugger_example.ipynb index 22e9b0c8..6e2acd27 100644 --- a/notebooks/code-bugger/openbugger_example.ipynb +++ b/notebooks/code-bugger/openbugger_example.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/code-bugger/openbugger_example.ipynb)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/data-argumentation/EssayInstructions.ipynb b/notebooks/data-argumentation/EssayInstructions.ipynb index c4179382..30834d32 100644 --- a/notebooks/data-argumentation/EssayInstructions.ipynb +++ b/notebooks/data-argumentation/EssayInstructions.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/EssayInstructions.ipynb)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/data-argumentation/EssayRevision.ipynb b/notebooks/data-argumentation/EssayRevision.ipynb index cba9bc5b..2397131c 100644 --- a/notebooks/data-argumentation/EssayRevision.ipynb +++ b/notebooks/data-argumentation/EssayRevision.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/EssayRevision.ipynb)" + ] + }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/data-argumentation/StackExchangeBuilder.ipynb b/notebooks/data-argumentation/StackExchangeBuilder.ipynb index 625d757b..b0dd9a8b 100644 --- a/notebooks/data-argumentation/StackExchangeBuilder.ipynb +++ b/notebooks/data-argumentation/StackExchangeBuilder.ipynb @@ -1,28 +1,22 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { + "attachments": {}, "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/StackExchangeBuilder.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TB7CEfs8F-8u" + }, "source": [ "# Ingest StackExchange data dumps\n", "This notebook takes a StackExchange Data dump \"Posts.xml\" file and ingests it into a Pandas Dataframe. Outputs of the file can be JSON, JSONL, Parquet, or CSV. " - ], - "metadata": { - "id": "TB7CEfs8F-8u" - } + ] }, { "cell_type": "code", @@ -40,16 +34,34 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "15mAL7GnzBv0" + }, "source": [ "# Extract StackExchange\n", "Pull StackExchange file dumps. Specific column types are enforced to prevent errors on processing later in the notebook" - ], - "metadata": { - "id": "15mAL7GnzBv0" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FtcvUEaHVxcW", + "outputId": "5b0cb19d-e3d9-422b-9077-52241bd09e0e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dict_keys(['3dprinting_meta', '3dprinting', 'Stackoverflow_com_Posts_7z', 'academia_meta', 'academia', 'ai_meta', 'ai', 'android_meta', 'android', 'anime_meta', 'anime', 'apple_meta', 'apple', 'arduino_meta', 'arduino', 'askubuntu_com_7z', 'astronomy_meta', 'astronomy', 'aviation_meta', 'aviation', 'avp_meta', 'avp', 'beer_meta', 'beer', 'bicycles_meta', 'bicycles', 'bioacoustics_meta', 'bioacoustics', 'bioinformatics_meta', 'bioinformatics', 'biology_meta', 'biology', 'bitcoin_meta', 'bitcoin', 'blender_meta', 'blender', 'boardgames_meta', 'boardgames', 'bricks_meta', 'bricks', 'buddhism_meta', 'buddhism', 'cardano_meta', 'cardano', 'chemistry_meta', 'chemistry', 'chess_meta', 'chess', 'chinese_meta', 'chinese', 'christianity_meta', 'christianity', 'civicrm_meta', 'civicrm', 'codegolf_meta', 'codegolf', 'codereview_meta', 'codereview', 'coffee_meta', 'coffee', 'cogsci_meta', 'cogsci', 'computergraphics_meta', 'computergraphics', 'conlang_meta', 'conlang', 'cooking_meta', 'cooking', 'craftcms_meta', 'craftcms', 'crafts_meta', 'crafts', 'crypto_meta', 'crypto', 'cs_meta', 'cs', 'cseducators_meta', 'cseducators', 'cstheory_meta', 'cstheory', 'datascience_meta', 'datascience', 'dba_meta', 'dba', 'devops_meta', 'devops', 'diy_meta', 'diy', 'drones_meta', 'drones', 'drupal_meta', 'drupal', 'dsp_meta', 'dsp', 'earthscience_meta', 'earthscience', 'ebooks_meta', 'ebooks', 'economics_meta', 'economics', 'electronics_meta', 'electronics', 'elementaryos_meta', 'elementaryos', 'ell_meta', 'ell', 'emacs_meta', 'emacs', 'engineering_meta', 'engineering', 'english_meta', 'english', 'eosio_meta', 'eosio', 'es_meta_stackoverflow_com_7z', 'es_stackoverflow_com_7z', 'esperanto_meta', 'esperanto', 'ethereum_meta', 'ethereum', 'expatriates_meta', 'expatriates', 'expressionengine_meta', 'expressionengine', 'fitness_meta', 'fitness', 'freelancing_meta', 'freelancing', 'french_meta', 'french', 'gamedev_meta', 'gamedev', 'gaming_meta', 'gaming', 'gardening_meta', 'gardening', 'genealogy_meta', 'genealogy', 'german_meta', 'german', 'gis_meta', 'gis', 'graphicdesign_meta', 'graphicdesign', 'ham_meta', 'ham', 'hardwarerecs_meta', 'hardwarerecs', 'health_meta', 'health', 'hermeneutics_meta', 'hermeneutics', 'hinduism_meta', 'hinduism', 'history_meta', 'history', 'homebrew_meta', 'homebrew', 'hsm_meta', 'hsm', 'interpersonal_meta', 'interpersonal', 'iot_meta', 'iot', 'iota_meta', 'iota', 'islam_meta', 'islam', 'italian_meta', 'italian', 'ja_meta_stackoverflow_com_7z', 'ja_stackoverflow_com_7z', 'japanese_meta', 'japanese', 'joomla_meta', 'joomla', 'judaism_meta', 'judaism', 'korean_meta', 'korean', 'languagelearning_meta', 'languagelearning', 'latin_meta', 'latin', 'law_meta', 'law', 'lifehacks_meta', 'lifehacks', 'linguistics_meta', 'linguistics', 'literature_meta', 'literature', 'magento_meta', 'magento', 'martialarts_meta', 'martialarts', 'materials_meta', 'materials', 'math_meta', 'math', 'matheducators_meta', 'matheducators', 'mathematica_meta', 'mathematica', 'mathoverflow_net_7z', 'mechanics_meta', 'mechanics', 'meta_askubuntu_com_7z', 'meta_mathoverflow_net_7z', 'meta_serverfault_com_7z', 'meta', 'meta_stackoverflow_com_7z', 'meta_superuser_com_7z', 'moderators_meta', 'moderators', 'monero_meta', 'monero', 'money_meta', 'money', 'movies_meta', 'movies', 'music_meta', 'music', 'musicfans_meta', 'musicfans', 'mythology_meta', 'mythology', 'networkengineering_meta', 'networkengineering', 'opendata_meta', 'opendata', 'opensource_meta', 'opensource', 'or_meta', 'or', 'outdoors_meta', 'outdoors', 'parenting_meta', 'parenting', 'patents_meta', 'patents', 'pets_meta', 'pets', 'philosophy_meta', 'philosophy', 'photo_meta', 'photo', 'physics_meta', 'physics', 'pm_meta', 'pm', 'poker_meta', 'poker', 'politics_meta', 'politics', 'portuguese_meta', 'portuguese', 'proofassistants_meta', 'proofassistants', 'pt_meta_stackoverflow_com_7z', 'pt_stackoverflow_com_7z', 'puzzling_meta', 'puzzling', 'quant_meta', 'quant', 'quantumcomputing_meta', 'quantumcomputing', 'raspberrypi_meta', 'raspberrypi', 'retrocomputing_meta', 'retrocomputing', 'reverseengineering_meta', 'reverseengineering', 'robotics_meta', 'robotics', 'rpg_meta', 'rpg', 'ru_meta_stackoverflow_com_7z', 'ru_stackoverflow_com_7z', 'rus_meta', 'rus', 'russian_meta', 'russian', 'salesforce_meta', 'salesforce', 'scicomp_meta', 'scicomp', 'scifi_meta', 'scifi', 'security_meta', 'security', 'serverfault_com_7z', 'sharepoint_meta', 'sharepoint', 'sitecore_meta', 'sitecore', 'skeptics_meta', 'skeptics', 'softwareengineering_meta', 'softwareengineering', 'softwarerecs_meta', 'softwarerecs', 'solana_meta', 'solana', 'sound_meta', 'sound', 'space_meta', 'space', 'spanish_meta', 'spanish', 'sports_meta', 'sports', 'sqa_meta', 'sqa', 'stackapps_com_7z', 'stackoverflow_com_Badges_7z', 'stackoverflow_com_Comments_7z', 'stackoverflow_com_PostHistory_7z', 'stackoverflow_com_PostLinks_7z', 'stackoverflow_com_Tags_7z', 'stackoverflow_com_Users_7z', 'stackoverflow_com_Votes_7z', 'stats_meta', 'stats', 'stellar_meta', 'stellar', 'substrate_meta', 'substrate', 'superuser_com_7z', 'sustainability_meta', 'sustainability', 'tex_meta', 'tex', 'tezos_meta', 'tezos', 'tor_meta', 'tor', 'travel_meta', 'travel', 'tridion_meta', 'tridion', 'ukrainian_meta', 'ukrainian', 'unix_meta', 'unix', 'ux_meta', 'ux', 'vegetarianism_meta', 'vegetarianism', 'vi_meta', 'vi', 'webapps_meta', 'webapps', 'webmasters_meta', 'webmasters', 'windowsphone_meta', 'windowsphone', 'woodworking_meta', 'woodworking', 'wordpress_meta', 'wordpress', 'workplace_meta', 'workplace', 'worldbuilding_meta', 'worldbuilding', 'writers_meta', 'writers'])\n", + "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\n" + ] + } + ], "source": [ "base_url = \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml\"\n", "\n", @@ -73,90 +85,11 @@ "\n", "print(urls.keys())\n", "print(urls.get(\"ai\"))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FtcvUEaHVxcW", - "outputId": "5b0cb19d-e3d9-422b-9077-52241bd09e0e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "dict_keys(['3dprinting_meta', '3dprinting', 'Stackoverflow_com_Posts_7z', 'academia_meta', 'academia', 'ai_meta', 'ai', 'android_meta', 'android', 'anime_meta', 'anime', 'apple_meta', 'apple', 'arduino_meta', 'arduino', 'askubuntu_com_7z', 'astronomy_meta', 'astronomy', 'aviation_meta', 'aviation', 'avp_meta', 'avp', 'beer_meta', 'beer', 'bicycles_meta', 'bicycles', 'bioacoustics_meta', 'bioacoustics', 'bioinformatics_meta', 'bioinformatics', 'biology_meta', 'biology', 'bitcoin_meta', 'bitcoin', 'blender_meta', 'blender', 'boardgames_meta', 'boardgames', 'bricks_meta', 'bricks', 'buddhism_meta', 'buddhism', 'cardano_meta', 'cardano', 'chemistry_meta', 'chemistry', 'chess_meta', 'chess', 'chinese_meta', 'chinese', 'christianity_meta', 'christianity', 'civicrm_meta', 'civicrm', 'codegolf_meta', 'codegolf', 'codereview_meta', 'codereview', 'coffee_meta', 'coffee', 'cogsci_meta', 'cogsci', 'computergraphics_meta', 'computergraphics', 'conlang_meta', 'conlang', 'cooking_meta', 'cooking', 'craftcms_meta', 'craftcms', 'crafts_meta', 'crafts', 'crypto_meta', 'crypto', 'cs_meta', 'cs', 'cseducators_meta', 'cseducators', 'cstheory_meta', 'cstheory', 'datascience_meta', 'datascience', 'dba_meta', 'dba', 'devops_meta', 'devops', 'diy_meta', 'diy', 'drones_meta', 'drones', 'drupal_meta', 'drupal', 'dsp_meta', 'dsp', 'earthscience_meta', 'earthscience', 'ebooks_meta', 'ebooks', 'economics_meta', 'economics', 'electronics_meta', 'electronics', 'elementaryos_meta', 'elementaryos', 'ell_meta', 'ell', 'emacs_meta', 'emacs', 'engineering_meta', 'engineering', 'english_meta', 'english', 'eosio_meta', 'eosio', 'es_meta_stackoverflow_com_7z', 'es_stackoverflow_com_7z', 'esperanto_meta', 'esperanto', 'ethereum_meta', 'ethereum', 'expatriates_meta', 'expatriates', 'expressionengine_meta', 'expressionengine', 'fitness_meta', 'fitness', 'freelancing_meta', 'freelancing', 'french_meta', 'french', 'gamedev_meta', 'gamedev', 'gaming_meta', 'gaming', 'gardening_meta', 'gardening', 'genealogy_meta', 'genealogy', 'german_meta', 'german', 'gis_meta', 'gis', 'graphicdesign_meta', 'graphicdesign', 'ham_meta', 'ham', 'hardwarerecs_meta', 'hardwarerecs', 'health_meta', 'health', 'hermeneutics_meta', 'hermeneutics', 'hinduism_meta', 'hinduism', 'history_meta', 'history', 'homebrew_meta', 'homebrew', 'hsm_meta', 'hsm', 'interpersonal_meta', 'interpersonal', 'iot_meta', 'iot', 'iota_meta', 'iota', 'islam_meta', 'islam', 'italian_meta', 'italian', 'ja_meta_stackoverflow_com_7z', 'ja_stackoverflow_com_7z', 'japanese_meta', 'japanese', 'joomla_meta', 'joomla', 'judaism_meta', 'judaism', 'korean_meta', 'korean', 'languagelearning_meta', 'languagelearning', 'latin_meta', 'latin', 'law_meta', 'law', 'lifehacks_meta', 'lifehacks', 'linguistics_meta', 'linguistics', 'literature_meta', 'literature', 'magento_meta', 'magento', 'martialarts_meta', 'martialarts', 'materials_meta', 'materials', 'math_meta', 'math', 'matheducators_meta', 'matheducators', 'mathematica_meta', 'mathematica', 'mathoverflow_net_7z', 'mechanics_meta', 'mechanics', 'meta_askubuntu_com_7z', 'meta_mathoverflow_net_7z', 'meta_serverfault_com_7z', 'meta', 'meta_stackoverflow_com_7z', 'meta_superuser_com_7z', 'moderators_meta', 'moderators', 'monero_meta', 'monero', 'money_meta', 'money', 'movies_meta', 'movies', 'music_meta', 'music', 'musicfans_meta', 'musicfans', 'mythology_meta', 'mythology', 'networkengineering_meta', 'networkengineering', 'opendata_meta', 'opendata', 'opensource_meta', 'opensource', 'or_meta', 'or', 'outdoors_meta', 'outdoors', 'parenting_meta', 'parenting', 'patents_meta', 'patents', 'pets_meta', 'pets', 'philosophy_meta', 'philosophy', 'photo_meta', 'photo', 'physics_meta', 'physics', 'pm_meta', 'pm', 'poker_meta', 'poker', 'politics_meta', 'politics', 'portuguese_meta', 'portuguese', 'proofassistants_meta', 'proofassistants', 'pt_meta_stackoverflow_com_7z', 'pt_stackoverflow_com_7z', 'puzzling_meta', 'puzzling', 'quant_meta', 'quant', 'quantumcomputing_meta', 'quantumcomputing', 'raspberrypi_meta', 'raspberrypi', 'retrocomputing_meta', 'retrocomputing', 'reverseengineering_meta', 'reverseengineering', 'robotics_meta', 'robotics', 'rpg_meta', 'rpg', 'ru_meta_stackoverflow_com_7z', 'ru_stackoverflow_com_7z', 'rus_meta', 'rus', 'russian_meta', 'russian', 'salesforce_meta', 'salesforce', 'scicomp_meta', 'scicomp', 'scifi_meta', 'scifi', 'security_meta', 'security', 'serverfault_com_7z', 'sharepoint_meta', 'sharepoint', 'sitecore_meta', 'sitecore', 'skeptics_meta', 'skeptics', 'softwareengineering_meta', 'softwareengineering', 'softwarerecs_meta', 'softwarerecs', 'solana_meta', 'solana', 'sound_meta', 'sound', 'space_meta', 'space', 'spanish_meta', 'spanish', 'sports_meta', 'sports', 'sqa_meta', 'sqa', 'stackapps_com_7z', 'stackoverflow_com_Badges_7z', 'stackoverflow_com_Comments_7z', 'stackoverflow_com_PostHistory_7z', 'stackoverflow_com_PostLinks_7z', 'stackoverflow_com_Tags_7z', 'stackoverflow_com_Users_7z', 'stackoverflow_com_Votes_7z', 'stats_meta', 'stats', 'stellar_meta', 'stellar', 'substrate_meta', 'substrate', 'superuser_com_7z', 'sustainability_meta', 'sustainability', 'tex_meta', 'tex', 'tezos_meta', 'tezos', 'tor_meta', 'tor', 'travel_meta', 'travel', 'tridion_meta', 'tridion', 'ukrainian_meta', 'ukrainian', 'unix_meta', 'unix', 'ux_meta', 'ux', 'vegetarianism_meta', 'vegetarianism', 'vi_meta', 'vi', 'webapps_meta', 'webapps', 'webmasters_meta', 'webmasters', 'windowsphone_meta', 'windowsphone', 'woodworking_meta', 'woodworking', 'wordpress_meta', 'wordpress', 'workplace_meta', 'workplace', 'worldbuilding_meta', 'worldbuilding', 'writers_meta', 'writers'])\n", - "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\n" - ] - } ] }, { "cell_type": "code", - "source": [ - "xml_format_map = {\n", - " \"Id\": int,\n", - " \"PostTypeId\": int,\n", - " \"CreationDate\": str,\n", - " \"Score\": int,\n", - " \"ViewCount\": int,\n", - " \"Body\": str,\n", - " \"AnswerCount\": int,\n", - " \"CommentCount\": int,\n", - " \"ContentLicense\": str,\n", - " \"AcceptedAnswerId\": int,\n", - " \"ParentId\": int,\n", - "}\n", - "\n", - "\n", - "# def extract_xml_file(file_url: str):\n", - "# table = pd.read_xml(file_url)\n", - "# return table\n", - "\n", - "\n", - "def xml_to_df(response: str):\n", - " \"\"\"\n", - " Collect and Manually import XML into Dataframe\n", - "\n", - " pd.read_xml() errors when XML trees are too large, this is just a hack to\n", - " download a XML file and parse into a Dataframe. **Not Tested on huge XML files**\n", - "\n", - " Parameters:\n", - " response (Requests.Response): Requests response object with the XML data\n", - "\n", - " Returns:\n", - " df (DataFrame): A Dataframe from the XML file\n", - " \"\"\"\n", - " soup = bs(response.content, \"xml\")\n", - " posts = soup.find_all(\"row\")\n", - "\n", - " all_posts = [post.attrs for post in posts]\n", - "\n", - " df = pd.DataFrame(all_posts)\n", - " df.AnswerCount.fillna(0, inplace=True)\n", - " df.ViewCount.fillna(0, inplace=True)\n", - " df.AcceptedAnswerId.fillna(0, inplace=True)\n", - " df.ParentId.fillna(0, inplace=True)\n", - " df[\"DataSource\"] = response.url\n", - " df = df.astype(xml_format_map)\n", - " return df\n", - "\n", - "\n", - "dataset_name = \"ai\"\n", - "\n", - "xml_posts_path = urls.get(dataset_name)\n", - "\n", - "\n", - "# df = extract_xml_file(test)\n", - "response = requests.get(xml_posts_path)\n", - "df = xml_to_df(response)\n", - "\n", - "\n", - "print(df.dtypes)\n", - "df.head()" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -165,11 +98,10 @@ "id": "-t27RnxdzBYB", "outputId": "5ec0ceed-c82b-48fa-facd-41b4aae2f9e6" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Id int64\n", "PostTypeId int64\n", @@ -198,53 +130,7 @@ ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " Id PostTypeId AcceptedAnswerId CreationDate Score \\\n", - "0 1 1 3 2016-08-02T15:39:14.947 10 \n", - "1 2 1 9 2016-08-02T15:40:20.623 14 \n", - "2 3 2 0 2016-08-02T15:40:24.820 15 \n", - "3 4 1 12 2016-08-02T15:41:22.020 33 \n", - "4 6 1 20 2016-08-02T15:43:35.460 7 \n", - "\n", - " ViewCount Body OwnerUserId \\\n", - "0 710

What does \"backprop\" mean? Is the \"backprop... 8 \n", - "1 1008

Does increasing the noise in data help to i... 8 \n", - "2 0

\"Backprop\" is the same as \"backpropagation\"... 4 \n", - "3 1266

When you're writing your algorithm, how do ... 8 \n", - "4 279

Given the following definition of an intell... 29 \n", - "\n", - " LastEditorUserId LastEditDate ... AnswerCount CommentCount \\\n", - "0 2444 2019-11-16T17:56:22.093 ... 5 0 \n", - "1 2444 2019-02-23T22:36:19.090 ... 3 0 \n", - "2 NaN NaN ... 0 0 \n", - "3 2444 2021-01-19T23:54:07.813 ... 4 0 \n", - "4 2444 2019-06-15T18:25:58.513 ... 2 0 \n", - "\n", - " ContentLicense ParentId ClosedDate FavoriteCount CommunityOwnedDate \\\n", - "0 CC BY-SA 4.0 0 NaN NaN NaN \n", - "1 CC BY-SA 4.0 0 NaN NaN NaN \n", - "2 CC BY-SA 3.0 1 NaN NaN NaN \n", - "3 CC BY-SA 3.0 0 NaN NaN NaN \n", - "4 CC BY-SA 4.0 0 NaN NaN NaN \n", - "\n", - " LastEditorDisplayName OwnerDisplayName \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " DataSource \n", - "0 https://ia600107.us.archive.org/view_archive.p... \n", - "1 https://ia600107.us.archive.org/view_archive.p... \n", - "2 https://ia600107.us.archive.org/view_archive.p... \n", - "3 https://ia600107.us.archive.org/view_archive.p... \n", - "4 https://ia600107.us.archive.org/view_archive.p... \n", - "\n", - "[5 rows x 23 columns]" - ], "text/html": [ "\n", "

\n", @@ -491,99 +377,133 @@ "
\n", " \n", " " + ], + "text/plain": [ + " Id PostTypeId AcceptedAnswerId CreationDate Score \\\n", + "0 1 1 3 2016-08-02T15:39:14.947 10 \n", + "1 2 1 9 2016-08-02T15:40:20.623 14 \n", + "2 3 2 0 2016-08-02T15:40:24.820 15 \n", + "3 4 1 12 2016-08-02T15:41:22.020 33 \n", + "4 6 1 20 2016-08-02T15:43:35.460 7 \n", + "\n", + " ViewCount Body OwnerUserId \\\n", + "0 710

What does \"backprop\" mean? Is the \"backprop... 8 \n", + "1 1008

Does increasing the noise in data help to i... 8 \n", + "2 0

\"Backprop\" is the same as \"backpropagation\"... 4 \n", + "3 1266

When you're writing your algorithm, how do ... 8 \n", + "4 279

Given the following definition of an intell... 29 \n", + "\n", + " LastEditorUserId LastEditDate ... AnswerCount CommentCount \\\n", + "0 2444 2019-11-16T17:56:22.093 ... 5 0 \n", + "1 2444 2019-02-23T22:36:19.090 ... 3 0 \n", + "2 NaN NaN ... 0 0 \n", + "3 2444 2021-01-19T23:54:07.813 ... 4 0 \n", + "4 2444 2019-06-15T18:25:58.513 ... 2 0 \n", + "\n", + " ContentLicense ParentId ClosedDate FavoriteCount CommunityOwnedDate \\\n", + "0 CC BY-SA 4.0 0 NaN NaN NaN \n", + "1 CC BY-SA 4.0 0 NaN NaN NaN \n", + "2 CC BY-SA 3.0 1 NaN NaN NaN \n", + "3 CC BY-SA 3.0 0 NaN NaN NaN \n", + "4 CC BY-SA 4.0 0 NaN NaN NaN \n", + "\n", + " LastEditorDisplayName OwnerDisplayName \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " DataSource \n", + "0 https://ia600107.us.archive.org/view_archive.p... \n", + "1 https://ia600107.us.archive.org/view_archive.p... \n", + "2 https://ia600107.us.archive.org/view_archive.p... \n", + "3 https://ia600107.us.archive.org/view_archive.p... \n", + "4 https://ia600107.us.archive.org/view_archive.p... \n", + "\n", + "[5 rows x 23 columns]" ] }, + "execution_count": 219, "metadata": {}, - "execution_count": 219 + "output_type": "execute_result" } + ], + "source": [ + "xml_format_map = {\n", + " \"Id\": int,\n", + " \"PostTypeId\": int,\n", + " \"CreationDate\": str,\n", + " \"Score\": int,\n", + " \"ViewCount\": int,\n", + " \"Body\": str,\n", + " \"AnswerCount\": int,\n", + " \"CommentCount\": int,\n", + " \"ContentLicense\": str,\n", + " \"AcceptedAnswerId\": int,\n", + " \"ParentId\": int,\n", + "}\n", + "\n", + "\n", + "# def extract_xml_file(file_url: str):\n", + "# table = pd.read_xml(file_url)\n", + "# return table\n", + "\n", + "\n", + "def xml_to_df(response: str):\n", + " \"\"\"\n", + " Collect and Manually import XML into Dataframe\n", + "\n", + " pd.read_xml() errors when XML trees are too large, this is just a hack to\n", + " download a XML file and parse into a Dataframe. **Not Tested on huge XML files**\n", + "\n", + " Parameters:\n", + " response (Requests.Response): Requests response object with the XML data\n", + "\n", + " Returns:\n", + " df (DataFrame): A Dataframe from the XML file\n", + " \"\"\"\n", + " soup = bs(response.content, \"xml\")\n", + " posts = soup.find_all(\"row\")\n", + "\n", + " all_posts = [post.attrs for post in posts]\n", + "\n", + " df = pd.DataFrame(all_posts)\n", + " df.AnswerCount.fillna(0, inplace=True)\n", + " df.ViewCount.fillna(0, inplace=True)\n", + " df.AcceptedAnswerId.fillna(0, inplace=True)\n", + " df.ParentId.fillna(0, inplace=True)\n", + " df[\"DataSource\"] = response.url\n", + " df = df.astype(xml_format_map)\n", + " return df\n", + "\n", + "\n", + "dataset_name = \"ai\"\n", + "\n", + "xml_posts_path = urls.get(dataset_name)\n", + "\n", + "\n", + "# df = extract_xml_file(test)\n", + "response = requests.get(xml_posts_path)\n", + "df = xml_to_df(response)\n", + "\n", + "\n", + "print(df.dtypes)\n", + "df.head()" ] }, { "cell_type": "markdown", - "source": [ - "# Transformations" - ], "metadata": { "id": "RAzTR7zY3oan" - } + }, + "source": [ + "# Transformations" + ] }, { "cell_type": "code", - "source": [ - "def filter_only_questions_with_accepted_answers(df):\n", - " \"\"\"**TODO**\n", - " Filter only to Questions with Accepted Answers\n", - "\n", - " Filter dataframe by questions that have accepted answers, should also include\n", - " all rows of answers for those questions, even if not accepted.\n", - "\n", - " Parameters:\n", - " df (DataFrame): containing a \"AcceptedAnswerId\", \"Id\", and \"ParentId\" columns\n", - "\n", - " Returns:\n", - " df (DataFrame): current dataframe with filtered results\n", - " \"\"\"\n", - " df = df[(df[\"AcceptedAnswerId\"].notnull()) | (df[\"ParentId\"] == df[\"Id\"])]\n", - "\n", - "\n", - "def filter_scores_above(df, question_score_threshold: int = 20, answer_score_threshold: int = 20):\n", - " \"\"\"**TODO**\n", - " Filter Dataframe by minimum scores\n", - "\n", - " Filter Question and Answer columns by score thresholds to trim lower scoring results\n", - "\n", - " Parameters:\n", - " df (DataFrame): containing a \"Score\" column\n", - "\n", - " Returns:\n", - " df (DataFrame): current dataframe with filtered results\n", - " \"\"\"\n", - " df = df[\n", - " ((df[\"Score\"] >= question_score_threshold) & (df.PostTypeId == 1))\n", - " | ((df[\"Score\"] >= answer_score_threshold) & (df.PostTypeId == 2))\n", - " ]\n", - "\n", - "\n", - "def convert_html_to_text(df, column: str = \"Body\"):\n", - " \"\"\"\n", - " Convert HTML tags to pure text\n", - "\n", - " Feeds HTML text body into BeautifulSoup to parse it to only text. Set aside as\n", - " function to provide option to skip\n", - "\n", - " Parameters:\n", - " df (DataFrame): containing a \"Body\" column with HTML\n", - "\n", - " Returns:\n", - " df (DataFrame): current dataframe with parsed column\n", - " \"\"\"\n", - " df.dropna(subset=[column], inplace=True)\n", - " df[f\"{column}Clean\"] = df[column].apply(lambda row: bs(row, \"html.parser\").text)\n", - "\n", - "\n", - "def clean_tags(df):\n", - " \"\"\"\n", - " Convert Tags into Comma separated\n", - "\n", - " Converts Tag slugs into commas separated tags\n", - "\n", - " Parameters:\n", - " df (DataFrame): containing a \"Tags\" column with slugs\n", - "\n", - " Returns:\n", - " df (DataFrame): current dataframe with parsed column\n", - " \"\"\"\n", - " df[\"TagsClean\"] = df[\"Tags\"].str.replace(\"-\", \" \").str.replace(\"><\", \", \").str.replace(\"<\", \"\").str.replace(\">\", \"\")\n", - "\n", - "\n", - "# filter_only_questions_with_accepted_answers(df)\n", - "# filter_scores_above(df)\n", - "convert_html_to_text(df)\n", - "clean_tags(df)\n", - "\n", - "df[[\"Body\", \"BodyClean\", \"Tags\", \"TagsClean\"]]\n", - "# print(df.shape)" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -592,66 +512,9 @@ "id": "qyUqc31Z3Z9g", "outputId": "18dce8b4-af26-49c9-ee73-6c677177b516" }, - "execution_count": null, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " Body \\\n", - "0

What does \"backprop\" mean? Is the \"backprop... \n", - "1

Does increasing the noise in data help to i... \n", - "2

\"Backprop\" is the same as \"backpropagation\"... \n", - "3

When you're writing your algorithm, how do ... \n", - "4

Given the following definition of an intell... \n", - "... ... \n", - "23174

The purpose of evaluating the state and act... \n", - "23175

In machine translation, convolution is a te... \n", - "23176

One of the key features of ChatGPT is its a... \n", - "23177

Given a neural network model for Covid-19 c... \n", - "23178

My question is more related to the fundamen... \n", - "\n", - " BodyClean \\\n", - "0 What does \"backprop\" mean? Is the \"backprop\" t... \n", - "1 Does increasing the noise in data help to impr... \n", - "2 \"Backprop\" is the same as \"backpropagation\": i... \n", - "3 When you're writing your algorithm, how do you... \n", - "4 Given the following definition of an intellige... \n", - "... ... \n", - "23174 The purpose of evaluating the state and action... \n", - "23175 In machine translation, convolution is a techn... \n", - "23176 One of the key features of ChatGPT is its abil... \n", - "23177 Given a neural network model for Covid-19 clas... \n", - "23178 My question is more related to the fundamental... \n", - "\n", - " Tags \\\n", - "0 ... \n", - "4 \n", - "... ... \n", - "23174 NaN \n", - "23175 NaN \n", - "23176 NaN \n", - "23177 \n", - "23178 \n", - "\n", - " TagsClean \n", - "0 neural networks, backpropagation, terminology,... \n", - "1 neural networks, machine learning, statistical... \n", - "2 NaN \n", - "3 neural networks, hyperparameter optimization, ... \n", - "4 philosophy, definitions, intelligent agent \n", - "... ... \n", - "23174 NaN \n", - "23175 NaN \n", - "23176 NaN \n", - "23177 neural networks, homework \n", - "23178 search, constraint satisfaction problems \n", - "\n", - "[23179 rows x 4 columns]" - ], "text/html": [ "\n", "

\n", @@ -838,75 +701,156 @@ "
\n", " \n", " " + ], + "text/plain": [ + " Body \\\n", + "0

What does \"backprop\" mean? Is the \"backprop... \n", + "1

Does increasing the noise in data help to i... \n", + "2

\"Backprop\" is the same as \"backpropagation\"... \n", + "3

When you're writing your algorithm, how do ... \n", + "4

Given the following definition of an intell... \n", + "... ... \n", + "23174

The purpose of evaluating the state and act... \n", + "23175

In machine translation, convolution is a te... \n", + "23176

One of the key features of ChatGPT is its a... \n", + "23177

Given a neural network model for Covid-19 c... \n", + "23178

My question is more related to the fundamen... \n", + "\n", + " BodyClean \\\n", + "0 What does \"backprop\" mean? Is the \"backprop\" t... \n", + "1 Does increasing the noise in data help to impr... \n", + "2 \"Backprop\" is the same as \"backpropagation\": i... \n", + "3 When you're writing your algorithm, how do you... \n", + "4 Given the following definition of an intellige... \n", + "... ... \n", + "23174 The purpose of evaluating the state and action... \n", + "23175 In machine translation, convolution is a techn... \n", + "23176 One of the key features of ChatGPT is its abil... \n", + "23177 Given a neural network model for Covid-19 clas... \n", + "23178 My question is more related to the fundamental... \n", + "\n", + " Tags \\\n", + "0 ... \n", + "4 \n", + "... ... \n", + "23174 NaN \n", + "23175 NaN \n", + "23176 NaN \n", + "23177 \n", + "23178 \n", + "\n", + " TagsClean \n", + "0 neural networks, backpropagation, terminology,... \n", + "1 neural networks, machine learning, statistical... \n", + "2 NaN \n", + "3 neural networks, hyperparameter optimization, ... \n", + "4 philosophy, definitions, intelligent agent \n", + "... ... \n", + "23174 NaN \n", + "23175 NaN \n", + "23176 NaN \n", + "23177 neural networks, homework \n", + "23178 search, constraint satisfaction problems \n", + "\n", + "[23179 rows x 4 columns]" ] }, + "execution_count": 220, "metadata": {}, - "execution_count": 220 + "output_type": "execute_result" } + ], + "source": [ + "def filter_only_questions_with_accepted_answers(df):\n", + " \"\"\"**TODO**\n", + " Filter only to Questions with Accepted Answers\n", + "\n", + " Filter dataframe by questions that have accepted answers, should also include\n", + " all rows of answers for those questions, even if not accepted.\n", + "\n", + " Parameters:\n", + " df (DataFrame): containing a \"AcceptedAnswerId\", \"Id\", and \"ParentId\" columns\n", + "\n", + " Returns:\n", + " df (DataFrame): current dataframe with filtered results\n", + " \"\"\"\n", + " df = df[(df[\"AcceptedAnswerId\"].notnull()) | (df[\"ParentId\"] == df[\"Id\"])]\n", + "\n", + "\n", + "def filter_scores_above(df, question_score_threshold: int = 20, answer_score_threshold: int = 20):\n", + " \"\"\"**TODO**\n", + " Filter Dataframe by minimum scores\n", + "\n", + " Filter Question and Answer columns by score thresholds to trim lower scoring results\n", + "\n", + " Parameters:\n", + " df (DataFrame): containing a \"Score\" column\n", + "\n", + " Returns:\n", + " df (DataFrame): current dataframe with filtered results\n", + " \"\"\"\n", + " df = df[\n", + " ((df[\"Score\"] >= question_score_threshold) & (df.PostTypeId == 1))\n", + " | ((df[\"Score\"] >= answer_score_threshold) & (df.PostTypeId == 2))\n", + " ]\n", + "\n", + "\n", + "def convert_html_to_text(df, column: str = \"Body\"):\n", + " \"\"\"\n", + " Convert HTML tags to pure text\n", + "\n", + " Feeds HTML text body into BeautifulSoup to parse it to only text. Set aside as\n", + " function to provide option to skip\n", + "\n", + " Parameters:\n", + " df (DataFrame): containing a \"Body\" column with HTML\n", + "\n", + " Returns:\n", + " df (DataFrame): current dataframe with parsed column\n", + " \"\"\"\n", + " df.dropna(subset=[column], inplace=True)\n", + " df[f\"{column}Clean\"] = df[column].apply(lambda row: bs(row, \"html.parser\").text)\n", + "\n", + "\n", + "def clean_tags(df):\n", + " \"\"\"\n", + " Convert Tags into Comma separated\n", + "\n", + " Converts Tag slugs into commas separated tags\n", + "\n", + " Parameters:\n", + " df (DataFrame): containing a \"Tags\" column with slugs\n", + "\n", + " Returns:\n", + " df (DataFrame): current dataframe with parsed column\n", + " \"\"\"\n", + " df[\"TagsClean\"] = df[\"Tags\"].str.replace(\"-\", \" \").str.replace(\"><\", \", \").str.replace(\"<\", \"\").str.replace(\">\", \"\")\n", + "\n", + "\n", + "# filter_only_questions_with_accepted_answers(df)\n", + "# filter_scores_above(df)\n", + "convert_html_to_text(df)\n", + "clean_tags(df)\n", + "\n", + "df[[\"Body\", \"BodyClean\", \"Tags\", \"TagsClean\"]]\n", + "# print(df.shape)" ] }, { "cell_type": "markdown", - "source": [ - "This groups questions with answers so that a row with a question also has a column with an answer. It then creates an AcceptedAnswerFlag column that is True if the answer was accepted by the person who asked the question. Changing the `number_of_results` variable will limit the number of answers you want to keep." - ], "metadata": { "id": "C09Bwdw-44PZ" - } + }, + "source": [ + "This groups questions with answers so that a row with a question also has a column with an answer. It then creates an AcceptedAnswerFlag column that is True if the answer was accepted by the person who asked the question. Changing the `number_of_results` variable will limit the number of answers you want to keep." + ] }, { "cell_type": "code", - "source": [ - "questions = df[df.PostTypeId == 1]\n", - "answers = df[df.PostTypeId == 2]\n", - "\n", - "df = pd.merge(\n", - " questions,\n", - " answers[\n", - " [\n", - " \"Id\",\n", - " \"CreationDate\",\n", - " \"Score\",\n", - " \"ViewCount\",\n", - " \"CommentCount\",\n", - " \"ContentLicense\",\n", - " \"TagsClean\",\n", - " \"BodyClean\",\n", - " \"ParentId\",\n", - " ]\n", - " ],\n", - " left_on=\"Id\",\n", - " right_on=\"ParentId\",\n", - " suffixes=(\"_q\", \"_a\"),\n", - " how=\"left\",\n", - ")\n", - "\n", - "df[\"AcceptedAnswerFlag\"] = df.apply(lambda row: row[\"Id_a\"] == row[\"AcceptedAnswerId\"], axis=1)\n", - "\n", - "df = df.rename(\n", - " columns={\n", - " \"BodyClean_q\": \"Question\",\n", - " \"Score_q\": \"QuestionScore\",\n", - " \"TagsClean_q\": \"QuestionTags\",\n", - " \"BodyClean_a\": \"Answer\",\n", - " \"Score_a\": \"AnswerScore\",\n", - " \"ContentLicense_q\": \"QuestionContentLicense\",\n", - " \"ContentLicense_a\": \"AnswerContentLicense\",\n", - " \"CreationDate_q\": \"CreationDate\",\n", - " }\n", - ")\n", - "\n", - "## Set the number of results to a lower number to only return top N rated Answers.\n", - "number_of_results = 25\n", - "df = (\n", - " df.sort_values(by=[\"AcceptedAnswerFlag\", \"AnswerScore\"], ascending=[False, False])\n", - " .groupby(\"Question\")\n", - " .head(number_of_results)\n", - " .reset_index(drop=True)\n", - ")\n", - "\n", - "df[[\"Id_q\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].head()" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -915,40 +859,9 @@ "id": "Bgz2fZ9k43Ab", "outputId": "28896d69-03cd-4877-fdfb-ae48dafa4ff3" }, - "execution_count": null, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " Id_q Question QuestionScore \\\n", - "0 1768 In Portal 2 we see that AI's can be \"killed\" b... 175 \n", - "1 10623 What is self-supervised learning in machine le... 91 \n", - "2 111 Obviously, self-driving cars aren't perfect, s... 100 \n", - "3 14224 If the original purpose for developing AI was ... 69 \n", - "4 1479 Do scientists or research experts know from th... 94 \n", - "\n", - " QuestionTags Id_a \\\n", - "0 philosophy, decision theory, mythology of ai, ... 1769.0 \n", - "1 machine learning, comparison, supervised learn... 10624.0 \n", - "2 philosophy, ethics, autonomous vehicles, decis... 1790.0 \n", - "3 philosophy, social, explainable ai 14247.0 \n", - "4 neural networks, deep learning, convolutional ... 4044.0 \n", - "\n", - " Answer AnswerScore \\\n", - "0 This classic problem exhibits a basic misunder... 146.0 \n", - "1 Introduction\\nThe term self-supervised learnin... 90.0 \n", - "2 \\nHow could self-driving cars make ethical dec... 76.0 \n", - "3 As argued by Selvaraju et al., there are three... 75.0 \n", - "4 There are many approaches that aim to make a t... 69.0 \n", - "\n", - " AcceptedAnswerFlag \n", - "0 True \n", - "1 True \n", - "2 True \n", - "3 True \n", - "4 True " - ], "text/html": [ "\n", "

\n", @@ -1116,22 +1029,97 @@ "
\n", " \n", " " + ], + "text/plain": [ + " Id_q Question QuestionScore \\\n", + "0 1768 In Portal 2 we see that AI's can be \"killed\" b... 175 \n", + "1 10623 What is self-supervised learning in machine le... 91 \n", + "2 111 Obviously, self-driving cars aren't perfect, s... 100 \n", + "3 14224 If the original purpose for developing AI was ... 69 \n", + "4 1479 Do scientists or research experts know from th... 94 \n", + "\n", + " QuestionTags Id_a \\\n", + "0 philosophy, decision theory, mythology of ai, ... 1769.0 \n", + "1 machine learning, comparison, supervised learn... 10624.0 \n", + "2 philosophy, ethics, autonomous vehicles, decis... 1790.0 \n", + "3 philosophy, social, explainable ai 14247.0 \n", + "4 neural networks, deep learning, convolutional ... 4044.0 \n", + "\n", + " Answer AnswerScore \\\n", + "0 This classic problem exhibits a basic misunder... 146.0 \n", + "1 Introduction\\nThe term self-supervised learnin... 90.0 \n", + "2 \\nHow could self-driving cars make ethical dec... 76.0 \n", + "3 As argued by Selvaraju et al., there are three... 75.0 \n", + "4 There are many approaches that aim to make a t... 69.0 \n", + "\n", + " AcceptedAnswerFlag \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " ] }, + "execution_count": 221, "metadata": {}, - "execution_count": 221 + "output_type": "execute_result" } + ], + "source": [ + "questions = df[df.PostTypeId == 1]\n", + "answers = df[df.PostTypeId == 2]\n", + "\n", + "df = pd.merge(\n", + " questions,\n", + " answers[\n", + " [\n", + " \"Id\",\n", + " \"CreationDate\",\n", + " \"Score\",\n", + " \"ViewCount\",\n", + " \"CommentCount\",\n", + " \"ContentLicense\",\n", + " \"TagsClean\",\n", + " \"BodyClean\",\n", + " \"ParentId\",\n", + " ]\n", + " ],\n", + " left_on=\"Id\",\n", + " right_on=\"ParentId\",\n", + " suffixes=(\"_q\", \"_a\"),\n", + " how=\"left\",\n", + ")\n", + "\n", + "df[\"AcceptedAnswerFlag\"] = df.apply(lambda row: row[\"Id_a\"] == row[\"AcceptedAnswerId\"], axis=1)\n", + "\n", + "df = df.rename(\n", + " columns={\n", + " \"BodyClean_q\": \"Question\",\n", + " \"Score_q\": \"QuestionScore\",\n", + " \"TagsClean_q\": \"QuestionTags\",\n", + " \"BodyClean_a\": \"Answer\",\n", + " \"Score_a\": \"AnswerScore\",\n", + " \"ContentLicense_q\": \"QuestionContentLicense\",\n", + " \"ContentLicense_a\": \"AnswerContentLicense\",\n", + " \"CreationDate_q\": \"CreationDate\",\n", + " }\n", + ")\n", + "\n", + "## Set the number of results to a lower number to only return top N rated Answers.\n", + "number_of_results = 25\n", + "df = (\n", + " df.sort_values(by=[\"AcceptedAnswerFlag\", \"AnswerScore\"], ascending=[False, False])\n", + " .groupby(\"Question\")\n", + " .head(number_of_results)\n", + " .reset_index(drop=True)\n", + ")\n", + "\n", + "df[[\"Id_q\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].head()" ] }, { "cell_type": "code", - "source": [ - "testing_id = df.Id_q.mode()[0]\n", - "df[(df.Id_q == testing_id) | (df.ParentId_a == testing_id)][\n", - " [\"Id_q\", \"Question\", \"ParentId_a\", \"AcceptedAnswerId\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]\n", - "]\n", - "# df[['Id_q', 'Question', 'ParentId_a', 'AcceptedAnswerId', 'Id_a', 'Answer', 'AnswerScore', 'AcceptedAnswerFlag']]" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1140,96 +1128,9 @@ "id": "eds1K8WL9QPo", "outputId": "bc526503-d6dd-428f-fa98-ad419d26a7dc" }, - "execution_count": null, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " Id_q Question ParentId_a \\\n", - "7 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "3662 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "3713 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "3788 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "3821 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "3882 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "4389 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "4849 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "4850 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "5763 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "5764 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "5765 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "7462 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "7463 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "7464 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "7465 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "7466 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "7467 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "9481 15730 As a human being, we can think infinity. In pr... 15730.0 \n", - "\n", - " AcceptedAnswerId Id_a \\\n", - "7 15744 15744.0 \n", - "3662 15744 15753.0 \n", - "3713 15744 15747.0 \n", - "3788 15744 15756.0 \n", - "3821 15744 15758.0 \n", - "3882 15744 15762.0 \n", - "4389 15744 15783.0 \n", - "4849 15744 15740.0 \n", - "4850 15744 15803.0 \n", - "5763 15744 15768.0 \n", - "5764 15744 15810.0 \n", - "5765 15744 15943.0 \n", - "7462 15744 15779.0 \n", - "7463 15744 15787.0 \n", - "7464 15744 15801.0 \n", - "7465 15744 15930.0 \n", - "7466 15744 15934.0 \n", - "7467 15744 15938.0 \n", - "9481 15744 15931.0 \n", - "\n", - " Answer AnswerScore \\\n", - "7 I think this is a fairly common misconception ... 62.0 \n", - "3662 I think your premise is flawed.\\nYou seem to a... 19.0 \n", - "3713 TL;DR: The subtleties of infinity are made app... 12.0 \n", - "3788 In Haskell, you can type:\\nprint [1..]\\nand it... 9.0 \n", - "3821 I believe humans can be said to understand inf... 8.0 \n", - "3882 (There's a summary at the bottom for those who... 7.0 \n", - "4389 Then premise assumes that humans \"understand\" ... 4.0 \n", - "4849 By adding some rules for infinity in arithmeti... 3.0 \n", - "4850 I think the concept that is missing in the dis... 3.0 \n", - "5763 Computers don't understand \"infinity\" or even ... 2.0 \n", - "5764 The Questions That Computers Can Never Answer ... 2.0 \n", - "5765 John Doucette's answer covers my thoughts on t... 2.0 \n", - "7462 I would think that a computer couldn’t underst... 1.0 \n", - "7463 The \"concept\" of infinity is 1 thing to unders... 1.0 \n", - "7464 Just food for thought: how about if we try to ... 1.0 \n", - "7465 Its arguable if we humans understand infinity.... 1.0 \n", - "7466 Well -- just to touch on the question of peopl... 1.0 \n", - "7467 Humans certainly don't understand infinity. Cu... 1.0 \n", - "9481 I think the property humans have which compute... 0.0 \n", - "\n", - " AcceptedAnswerFlag \n", - "7 True \n", - "3662 False \n", - "3713 False \n", - "3788 False \n", - "3821 False \n", - "3882 False \n", - "4389 False \n", - "4849 False \n", - "4850 False \n", - "5763 False \n", - "5764 False \n", - "5765 False \n", - "7462 False \n", - "7463 False \n", - "7464 False \n", - "7465 False \n", - "7466 False \n", - "7467 False \n", - "9481 False " - ], "text/html": [ "\n", "
\n", @@ -1551,43 +1452,119 @@ "
\n", " \n", " " + ], + "text/plain": [ + " Id_q Question ParentId_a \\\n", + "7 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3662 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3713 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3788 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3821 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3882 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "4389 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "4849 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "4850 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "5763 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "5764 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "5765 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7462 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7463 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7464 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7465 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7466 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7467 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "9481 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "\n", + " AcceptedAnswerId Id_a \\\n", + "7 15744 15744.0 \n", + "3662 15744 15753.0 \n", + "3713 15744 15747.0 \n", + "3788 15744 15756.0 \n", + "3821 15744 15758.0 \n", + "3882 15744 15762.0 \n", + "4389 15744 15783.0 \n", + "4849 15744 15740.0 \n", + "4850 15744 15803.0 \n", + "5763 15744 15768.0 \n", + "5764 15744 15810.0 \n", + "5765 15744 15943.0 \n", + "7462 15744 15779.0 \n", + "7463 15744 15787.0 \n", + "7464 15744 15801.0 \n", + "7465 15744 15930.0 \n", + "7466 15744 15934.0 \n", + "7467 15744 15938.0 \n", + "9481 15744 15931.0 \n", + "\n", + " Answer AnswerScore \\\n", + "7 I think this is a fairly common misconception ... 62.0 \n", + "3662 I think your premise is flawed.\\nYou seem to a... 19.0 \n", + "3713 TL;DR: The subtleties of infinity are made app... 12.0 \n", + "3788 In Haskell, you can type:\\nprint [1..]\\nand it... 9.0 \n", + "3821 I believe humans can be said to understand inf... 8.0 \n", + "3882 (There's a summary at the bottom for those who... 7.0 \n", + "4389 Then premise assumes that humans \"understand\" ... 4.0 \n", + "4849 By adding some rules for infinity in arithmeti... 3.0 \n", + "4850 I think the concept that is missing in the dis... 3.0 \n", + "5763 Computers don't understand \"infinity\" or even ... 2.0 \n", + "5764 The Questions That Computers Can Never Answer ... 2.0 \n", + "5765 John Doucette's answer covers my thoughts on t... 2.0 \n", + "7462 I would think that a computer couldn’t underst... 1.0 \n", + "7463 The \"concept\" of infinity is 1 thing to unders... 1.0 \n", + "7464 Just food for thought: how about if we try to ... 1.0 \n", + "7465 Its arguable if we humans understand infinity.... 1.0 \n", + "7466 Well -- just to touch on the question of peopl... 1.0 \n", + "7467 Humans certainly don't understand infinity. Cu... 1.0 \n", + "9481 I think the property humans have which compute... 0.0 \n", + "\n", + " AcceptedAnswerFlag \n", + "7 True \n", + "3662 False \n", + "3713 False \n", + "3788 False \n", + "3821 False \n", + "3882 False \n", + "4389 False \n", + "4849 False \n", + "4850 False \n", + "5763 False \n", + "5764 False \n", + "5765 False \n", + "7462 False \n", + "7463 False \n", + "7464 False \n", + "7465 False \n", + "7466 False \n", + "7467 False \n", + "9481 False " ] }, + "execution_count": 222, "metadata": {}, - "execution_count": 222 + "output_type": "execute_result" } + ], + "source": [ + "testing_id = df.Id_q.mode()[0]\n", + "df[(df.Id_q == testing_id) | (df.ParentId_a == testing_id)][\n", + " [\"Id_q\", \"Question\", \"ParentId_a\", \"AcceptedAnswerId\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]\n", + "]\n", + "# df[['Id_q', 'Question', 'ParentId_a', 'AcceptedAnswerId', 'Id_a', 'Answer', 'AnswerScore', 'AcceptedAnswerFlag']]" ] }, { "cell_type": "markdown", + "metadata": { + "id": "gXgpXEO7DCbj" + }, "source": [ "# Create JSONL version of Dataframe\n", "This groups the dataframe by question data and creates nested list of Answers for that group. The entire list contains individual JSON objects, each representing a single question in the dataset with a key, Answers, which contains a list of dictionaries for each answer to the question." - ], - "metadata": { - "id": "gXgpXEO7DCbj" - } + ] }, { "cell_type": "code", - "source": [ - "j = (\n", - " df.groupby(\n", - " [\"Title\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"QuestionContentLicense\", \"DataSource\", \"CreationDate\"]\n", - " )\n", - " .apply(lambda x: x[[\"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].to_dict(\"records\"))\n", - " .reset_index()\n", - " .rename(columns={0: \"Answers\"})\n", - " .to_json(orient=\"records\")\n", - ")\n", - "\n", - "data = json.loads(j)\n", - "\n", - "for post in data:\n", - " if len(post.get(\"Answers\")) >= 4:\n", - " print(json.dumps(post, indent=4))\n", - " break" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1595,11 +1572,10 @@ "id": "OBR58MSRzAMP", "outputId": "c7da1e6c-3a97-465d-c9ba-7e055cb0d751" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "{\n", " \"Title\": \"1 hidden layer with 1000 neurons vs. 10 hidden layers with 100 neurons\",\n", @@ -1634,21 +1610,56 @@ "}\n" ] } + ], + "source": [ + "j = (\n", + " df.groupby(\n", + " [\"Title\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"QuestionContentLicense\", \"DataSource\", \"CreationDate\"]\n", + " )\n", + " .apply(lambda x: x[[\"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].to_dict(\"records\"))\n", + " .reset_index()\n", + " .rename(columns={0: \"Answers\"})\n", + " .to_json(orient=\"records\")\n", + ")\n", + "\n", + "data = json.loads(j)\n", + "\n", + "for post in data:\n", + " if len(post.get(\"Answers\")) >= 4:\n", + " print(json.dumps(post, indent=4))\n", + " break" ] }, { "cell_type": "markdown", + "metadata": { + "id": "PlNjrpXaDm1_" + }, "source": [ "# Save file\n", "\n", "Files can be saved as JSON, JSONL, CSV, or Parquet" - ], - "metadata": { - "id": "PlNjrpXaDm1_" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CU0gWRGQDqIs", + "outputId": "9646e475-cedd-46f1-f9b8-7eb1fbc703c7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\n" + ] + } + ], "source": [ "file_name = dataset_name\n", "\n", @@ -1685,40 +1696,28 @@ "\n", "# save_data(data=data, file_name=file_name, file_type='jsonl')\n", "# save_data(data=df, file_name=file_name, file_type='parquet')" - ], - "metadata": { - "id": "CU0gWRGQDqIs", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "9646e475-cedd-46f1-f9b8-7eb1fbc703c7" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\n" - ] - } ] }, { "cell_type": "markdown", + "metadata": { + "id": "BdN3hKxtgH7f" + }, "source": [ "# Open-Assistant Data Scheme\n", "\n", "Testing putting the data into the Open-Assistant Data Scheme\n", "\n", "https://github.com/LAION-AI/Open-Assistant/blob/main/docs/data_schemas.md" - ], - "metadata": { - "id": "BdN3hKxtgH7f" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n8ubYQxegNSY" + }, + "outputs": [], "source": [ "from typing import TypeVar, List, Dict, Any, Literal\n", "from json import JSONEncoder\n", @@ -1754,15 +1753,53 @@ "class TreeEncoder(JSONEncoder):\n", " def default(self, o):\n", " return o.__dict__" - ], - "metadata": { - "id": "n8ubYQxegNSY" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eE0fkytExSGl", + "outputId": "594632d6-f98c-49b8-af86-25f7f5e2ce06" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"root\": {\n", + " \"text\": \"Science Fiction has frequently shown AI to be a threat to the very existence of mankind. AI systems have often been the antagonists in many works of fiction, from 2001: A Space Odyssey through to The Terminator and beyond.\\nThe Media seems to buy into this trope as well. And in recent years we have had people like Elon Musk warn us of the dangers of an impending AI revolution, stating that AI is more dangerous than nukes.\\nAnd, apparently, experts think that we will be seeing this AI revolution in the next 100 years.\\nHowever, from my (albeit limited) study of AI, I get the impression that they are all wrong. I am going to outline my understanding below, please correct me if I am wrong:\\n\\nFirstly, all of these things seem to be confusing Artificial Intelligence with Artificial Consciousness. AI is essentially a system to make intelligent decisions, whereas AC is more like the \\\"self-aware\\\" systems that are shown in science fiction.\\n\\nNot AI itself, but intelligence and intelligent decision-making algorithms are something we've been working with and enhancing since before computers have been around. Moving this over to an artificial framework is fairly easy. However, consciousness is still something we are learning about. My guess is we won't be able to re-create something artificially if we barely understand how it works in the real world.\\n\\nSo, my conclusion is that no AI system will be able to learn enough to start thinking for itself, and that all our warnings of AI are completely unjustified.\\n\\nThe real danger comes from AC, which we are a long, long way from realizing because we are still a long way off from defining exactly what consciousness is, let alone understanding it.\\n\\n\\n\\nSo, my question is, assuming that my understanding is correct, are any efforts are being made by companies or organizations that work with AI to correct these popular misunderstandings in sci-fi, the media, and/or the public?\\nOr are the proponents of AI ambivalent towards this public fear-mongering?\\nI understand that the fear mongering is going to remain popular for some time, as bad news sells better than good news. I am just wondering if the general attitude from AI organizations is to ignore this popular misconception, or whether a concerted effort is being made to fight against these AI myths (but unfortunately nobody in the media is listening or cares).\\n\",\n", + " \"role\": \"prompter\",\n", + " \"children\": [\n", + " {\n", + " \"text\": \"Nothing. \\nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators makes a perception that the field has greater capabilities than it does $\\\\rightarrow$ this leads to grants, funding, etc... \\nIs there any negative? Yes. Misconceptions always have drawbacks. We see the creation of dumb ethics boards and such cough cough Elon Musk.\\nBut if history has anything to say about this, as the field gains popularity (which it is dnagerously quick), information will spread by definition, and eventually misconceptions will be laid to rest.\\nNote that this answer is biased and based upon my own opinions\\n\",\n", + " \"role\": \"assistant\",\n", + " \"children\": [],\n", + " \"metadata\": {\n", + " \"AnswerScore\": 2.0,\n", + " \"AcceptedAnswerFlag\": true\n", + " }\n", + " }\n", + " ],\n", + " \"metadata\": {\n", + " \"QuestionScore\": 5,\n", + " \"QuestionTags\": \"social, artificial consciousness\"\n", + " }\n", + " },\n", + " \"metadata\": {\n", + " \"Title\": \"\\\"AI will kill us all! The machines will rise up!\\\" - what is being done to dispel such myths?\",\n", + " \"QuestionContentLicense\": \"CC BY-SA 4.0\",\n", + " \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n", + " \"CreationDate\": \"2019-10-16T13:57:37.143\"\n", + " }\n", + "}\n" + ] + } + ], "source": [ "conversation_forest = []\n", "\n", @@ -1796,50 +1833,21 @@ "\n", "\n", "print(json.dumps(conversation_forest_json, indent=4), file=open(f\"/content/{file_name}.json\", \"w\"))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eE0fkytExSGl", - "outputId": "594632d6-f98c-49b8-af86-25f7f5e2ce06" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{\n", - " \"root\": {\n", - " \"text\": \"Science Fiction has frequently shown AI to be a threat to the very existence of mankind. AI systems have often been the antagonists in many works of fiction, from 2001: A Space Odyssey through to The Terminator and beyond.\\nThe Media seems to buy into this trope as well. And in recent years we have had people like Elon Musk warn us of the dangers of an impending AI revolution, stating that AI is more dangerous than nukes.\\nAnd, apparently, experts think that we will be seeing this AI revolution in the next 100 years.\\nHowever, from my (albeit limited) study of AI, I get the impression that they are all wrong. I am going to outline my understanding below, please correct me if I am wrong:\\n\\nFirstly, all of these things seem to be confusing Artificial Intelligence with Artificial Consciousness. AI is essentially a system to make intelligent decisions, whereas AC is more like the \\\"self-aware\\\" systems that are shown in science fiction.\\n\\nNot AI itself, but intelligence and intelligent decision-making algorithms are something we've been working with and enhancing since before computers have been around. Moving this over to an artificial framework is fairly easy. However, consciousness is still something we are learning about. My guess is we won't be able to re-create something artificially if we barely understand how it works in the real world.\\n\\nSo, my conclusion is that no AI system will be able to learn enough to start thinking for itself, and that all our warnings of AI are completely unjustified.\\n\\nThe real danger comes from AC, which we are a long, long way from realizing because we are still a long way off from defining exactly what consciousness is, let alone understanding it.\\n\\n\\n\\nSo, my question is, assuming that my understanding is correct, are any efforts are being made by companies or organizations that work with AI to correct these popular misunderstandings in sci-fi, the media, and/or the public?\\nOr are the proponents of AI ambivalent towards this public fear-mongering?\\nI understand that the fear mongering is going to remain popular for some time, as bad news sells better than good news. I am just wondering if the general attitude from AI organizations is to ignore this popular misconception, or whether a concerted effort is being made to fight against these AI myths (but unfortunately nobody in the media is listening or cares).\\n\",\n", - " \"role\": \"prompter\",\n", - " \"children\": [\n", - " {\n", - " \"text\": \"Nothing. \\nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators makes a perception that the field has greater capabilities than it does $\\\\rightarrow$ this leads to grants, funding, etc... \\nIs there any negative? Yes. Misconceptions always have drawbacks. We see the creation of dumb ethics boards and such cough cough Elon Musk.\\nBut if history has anything to say about this, as the field gains popularity (which it is dnagerously quick), information will spread by definition, and eventually misconceptions will be laid to rest.\\nNote that this answer is biased and based upon my own opinions\\n\",\n", - " \"role\": \"assistant\",\n", - " \"children\": [],\n", - " \"metadata\": {\n", - " \"AnswerScore\": 2.0,\n", - " \"AcceptedAnswerFlag\": true\n", - " }\n", - " }\n", - " ],\n", - " \"metadata\": {\n", - " \"QuestionScore\": 5,\n", - " \"QuestionTags\": \"social, artificial consciousness\"\n", - " }\n", - " },\n", - " \"metadata\": {\n", - " \"Title\": \"\\\"AI will kill us all! The machines will rise up!\\\" - what is being done to dispel such myths?\",\n", - " \"QuestionContentLicense\": \"CC BY-SA 4.0\",\n", - " \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n", - " \"CreationDate\": \"2019-10-16T13:57:37.143\"\n", - " }\n", - "}\n" - ] - } ] } - ] + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb b/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb index 907c4365..5ec0f019 100644 --- a/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb +++ b/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb @@ -1,5 +1,23 @@ { "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment below to install required python packages\n", + "#!pip install detoxify" + ] + }, { "attachments": {}, "cell_type": "markdown",