mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-06-27 16:10:30 +08:00
1846 lines
86 KiB
Plaintext
1846 lines
86 KiB
Plaintext
{
|
||
"nbformat": 4,
|
||
"nbformat_minor": 0,
|
||
"metadata": {
|
||
"colab": {
|
||
"provenance": []
|
||
},
|
||
"kernelspec": {
|
||
"name": "python3",
|
||
"display_name": "Python 3"
|
||
},
|
||
"language_info": {
|
||
"name": "python"
|
||
}
|
||
},
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"# Ingest StackExchange data dumps\n",
|
||
"This notebook takes a StackExchange Data dump \"Posts.xml\" file and ingests it into a Pandas Dataframe. Outputs of the file can be JSON, JSONL, Parquet, or CSV. "
|
||
],
|
||
"metadata": {
|
||
"id": "TB7CEfs8F-8u"
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"id": "0rHryQttyzyY"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from bs4 import BeautifulSoup as bs\n",
|
||
"import pandas as pd\n",
|
||
"import requests\n",
|
||
"import json"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"# Extract StackExchange\n",
|
||
"Pull StackExchange file dumps. Specific column types are enforced to prevent errors on processing later in the notebook"
|
||
],
|
||
"metadata": {
|
||
"id": "15mAL7GnzBv0"
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"base_url = \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml\"\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_all_filenames():\n",
|
||
" response = requests.get(\"https://archive.org/download/stackexchange\")\n",
|
||
" if response.ok:\n",
|
||
" soup = bs(response.content, \"html.parser\")\n",
|
||
" table = soup.find(\"table\")\n",
|
||
" link_tags = table.find_all(\"a\")\n",
|
||
" urls = {}\n",
|
||
" for link in link_tags:\n",
|
||
" url = link[\"href\"]\n",
|
||
" name = url.split(\".stackexchange\")[0].replace(\".\", \"_\").replace(\"-\", \"_\")\n",
|
||
" if url.endswith(\"7z\"):\n",
|
||
" urls[name] = base_url.format(url)\n",
|
||
" return urls\n",
|
||
"\n",
|
||
"\n",
|
||
"urls = get_all_filenames()\n",
|
||
"\n",
|
||
"print(urls.keys())\n",
|
||
"print(urls.get(\"ai\"))"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "FtcvUEaHVxcW",
|
||
"outputId": "5b0cb19d-e3d9-422b-9077-52241bd09e0e"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"dict_keys(['3dprinting_meta', '3dprinting', 'Stackoverflow_com_Posts_7z', 'academia_meta', 'academia', 'ai_meta', 'ai', 'android_meta', 'android', 'anime_meta', 'anime', 'apple_meta', 'apple', 'arduino_meta', 'arduino', 'askubuntu_com_7z', 'astronomy_meta', 'astronomy', 'aviation_meta', 'aviation', 'avp_meta', 'avp', 'beer_meta', 'beer', 'bicycles_meta', 'bicycles', 'bioacoustics_meta', 'bioacoustics', 'bioinformatics_meta', 'bioinformatics', 'biology_meta', 'biology', 'bitcoin_meta', 'bitcoin', 'blender_meta', 'blender', 'boardgames_meta', 'boardgames', 'bricks_meta', 'bricks', 'buddhism_meta', 'buddhism', 'cardano_meta', 'cardano', 'chemistry_meta', 'chemistry', 'chess_meta', 'chess', 'chinese_meta', 'chinese', 'christianity_meta', 'christianity', 'civicrm_meta', 'civicrm', 'codegolf_meta', 'codegolf', 'codereview_meta', 'codereview', 'coffee_meta', 'coffee', 'cogsci_meta', 'cogsci', 'computergraphics_meta', 'computergraphics', 'conlang_meta', 'conlang', 'cooking_meta', 'cooking', 'craftcms_meta', 'craftcms', 'crafts_meta', 'crafts', 'crypto_meta', 'crypto', 'cs_meta', 'cs', 'cseducators_meta', 'cseducators', 'cstheory_meta', 'cstheory', 'datascience_meta', 'datascience', 'dba_meta', 'dba', 'devops_meta', 'devops', 'diy_meta', 'diy', 'drones_meta', 'drones', 'drupal_meta', 'drupal', 'dsp_meta', 'dsp', 'earthscience_meta', 'earthscience', 'ebooks_meta', 'ebooks', 'economics_meta', 'economics', 'electronics_meta', 'electronics', 'elementaryos_meta', 'elementaryos', 'ell_meta', 'ell', 'emacs_meta', 'emacs', 'engineering_meta', 'engineering', 'english_meta', 'english', 'eosio_meta', 'eosio', 'es_meta_stackoverflow_com_7z', 'es_stackoverflow_com_7z', 'esperanto_meta', 'esperanto', 'ethereum_meta', 'ethereum', 'expatriates_meta', 'expatriates', 'expressionengine_meta', 'expressionengine', 'fitness_meta', 'fitness', 'freelancing_meta', 'freelancing', 'french_meta', 'french', 'gamedev_meta', 'gamedev', 'gaming_meta', 'gaming', 'gardening_meta', 'gardening', 
'genealogy_meta', 'genealogy', 'german_meta', 'german', 'gis_meta', 'gis', 'graphicdesign_meta', 'graphicdesign', 'ham_meta', 'ham', 'hardwarerecs_meta', 'hardwarerecs', 'health_meta', 'health', 'hermeneutics_meta', 'hermeneutics', 'hinduism_meta', 'hinduism', 'history_meta', 'history', 'homebrew_meta', 'homebrew', 'hsm_meta', 'hsm', 'interpersonal_meta', 'interpersonal', 'iot_meta', 'iot', 'iota_meta', 'iota', 'islam_meta', 'islam', 'italian_meta', 'italian', 'ja_meta_stackoverflow_com_7z', 'ja_stackoverflow_com_7z', 'japanese_meta', 'japanese', 'joomla_meta', 'joomla', 'judaism_meta', 'judaism', 'korean_meta', 'korean', 'languagelearning_meta', 'languagelearning', 'latin_meta', 'latin', 'law_meta', 'law', 'lifehacks_meta', 'lifehacks', 'linguistics_meta', 'linguistics', 'literature_meta', 'literature', 'magento_meta', 'magento', 'martialarts_meta', 'martialarts', 'materials_meta', 'materials', 'math_meta', 'math', 'matheducators_meta', 'matheducators', 'mathematica_meta', 'mathematica', 'mathoverflow_net_7z', 'mechanics_meta', 'mechanics', 'meta_askubuntu_com_7z', 'meta_mathoverflow_net_7z', 'meta_serverfault_com_7z', 'meta', 'meta_stackoverflow_com_7z', 'meta_superuser_com_7z', 'moderators_meta', 'moderators', 'monero_meta', 'monero', 'money_meta', 'money', 'movies_meta', 'movies', 'music_meta', 'music', 'musicfans_meta', 'musicfans', 'mythology_meta', 'mythology', 'networkengineering_meta', 'networkengineering', 'opendata_meta', 'opendata', 'opensource_meta', 'opensource', 'or_meta', 'or', 'outdoors_meta', 'outdoors', 'parenting_meta', 'parenting', 'patents_meta', 'patents', 'pets_meta', 'pets', 'philosophy_meta', 'philosophy', 'photo_meta', 'photo', 'physics_meta', 'physics', 'pm_meta', 'pm', 'poker_meta', 'poker', 'politics_meta', 'politics', 'portuguese_meta', 'portuguese', 'proofassistants_meta', 'proofassistants', 'pt_meta_stackoverflow_com_7z', 'pt_stackoverflow_com_7z', 'puzzling_meta', 'puzzling', 'quant_meta', 'quant', 'quantumcomputing_meta', 
'quantumcomputing', 'raspberrypi_meta', 'raspberrypi', 'retrocomputing_meta', 'retrocomputing', 'reverseengineering_meta', 'reverseengineering', 'robotics_meta', 'robotics', 'rpg_meta', 'rpg', 'ru_meta_stackoverflow_com_7z', 'ru_stackoverflow_com_7z', 'rus_meta', 'rus', 'russian_meta', 'russian', 'salesforce_meta', 'salesforce', 'scicomp_meta', 'scicomp', 'scifi_meta', 'scifi', 'security_meta', 'security', 'serverfault_com_7z', 'sharepoint_meta', 'sharepoint', 'sitecore_meta', 'sitecore', 'skeptics_meta', 'skeptics', 'softwareengineering_meta', 'softwareengineering', 'softwarerecs_meta', 'softwarerecs', 'solana_meta', 'solana', 'sound_meta', 'sound', 'space_meta', 'space', 'spanish_meta', 'spanish', 'sports_meta', 'sports', 'sqa_meta', 'sqa', 'stackapps_com_7z', 'stackoverflow_com_Badges_7z', 'stackoverflow_com_Comments_7z', 'stackoverflow_com_PostHistory_7z', 'stackoverflow_com_PostLinks_7z', 'stackoverflow_com_Tags_7z', 'stackoverflow_com_Users_7z', 'stackoverflow_com_Votes_7z', 'stats_meta', 'stats', 'stellar_meta', 'stellar', 'substrate_meta', 'substrate', 'superuser_com_7z', 'sustainability_meta', 'sustainability', 'tex_meta', 'tex', 'tezos_meta', 'tezos', 'tor_meta', 'tor', 'travel_meta', 'travel', 'tridion_meta', 'tridion', 'ukrainian_meta', 'ukrainian', 'unix_meta', 'unix', 'ux_meta', 'ux', 'vegetarianism_meta', 'vegetarianism', 'vi_meta', 'vi', 'webapps_meta', 'webapps', 'webmasters_meta', 'webmasters', 'windowsphone_meta', 'windowsphone', 'woodworking_meta', 'woodworking', 'wordpress_meta', 'wordpress', 'workplace_meta', 'workplace', 'worldbuilding_meta', 'worldbuilding', 'writers_meta', 'writers'])\n",
|
||
"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"xml_format_map = {\n",
|
||
" \"Id\": int,\n",
|
||
" \"PostTypeId\": int,\n",
|
||
" \"CreationDate\": str,\n",
|
||
" \"Score\": int,\n",
|
||
" \"ViewCount\": int,\n",
|
||
" \"Body\": str,\n",
|
||
" \"AnswerCount\": int,\n",
|
||
" \"CommentCount\": int,\n",
|
||
" \"ContentLicense\": str,\n",
|
||
" \"AcceptedAnswerId\": int,\n",
|
||
" \"ParentId\": int,\n",
|
||
"}\n",
|
||
"\n",
|
||
"\n",
|
||
"# def extract_xml_file(file_url: str):\n",
|
||
"# table = pd.read_xml(file_url)\n",
|
||
"# return table\n",
|
||
"\n",
|
||
"\n",
|
||
"def xml_to_df(response: str):\n",
|
||
" \"\"\"\n",
|
||
" Collect and Manually import XML into Dataframe\n",
|
||
"\n",
|
||
" pd.read_xml() errors when XML trees are too large, this is just a hack to\n",
|
||
" download a XML file and parse into a Dataframe. **Not Tested on huge XML files**\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" response (Requests.Response): Requests response object with the XML data\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" df (DataFrame): A Dataframe from the XML file\n",
|
||
" \"\"\"\n",
|
||
" soup = bs(response.content, \"xml\")\n",
|
||
" posts = soup.find_all(\"row\")\n",
|
||
"\n",
|
||
" all_posts = [post.attrs for post in posts]\n",
|
||
"\n",
|
||
" df = pd.DataFrame(all_posts)\n",
|
||
" df.AnswerCount.fillna(0, inplace=True)\n",
|
||
" df.ViewCount.fillna(0, inplace=True)\n",
|
||
" df.AcceptedAnswerId.fillna(0, inplace=True)\n",
|
||
" df.ParentId.fillna(0, inplace=True)\n",
|
||
" df[\"DataSource\"] = response.url\n",
|
||
" df = df.astype(xml_format_map)\n",
|
||
" return df\n",
|
||
"\n",
|
||
"\n",
|
||
"dataset_name = \"ai\"\n",
|
||
"\n",
|
||
"xml_posts_path = urls.get(dataset_name)\n",
|
||
"\n",
|
||
"\n",
|
||
"# df = extract_xml_file(test)\n",
|
||
"response = requests.get(xml_posts_path)\n",
|
||
"df = xml_to_df(response)\n",
|
||
"\n",
|
||
"\n",
|
||
"print(df.dtypes)\n",
|
||
"df.head()"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 907
|
||
},
|
||
"id": "-t27RnxdzBYB",
|
||
"outputId": "5ec0ceed-c82b-48fa-facd-41b4aae2f9e6"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Id int64\n",
|
||
"PostTypeId int64\n",
|
||
"AcceptedAnswerId int64\n",
|
||
"CreationDate object\n",
|
||
"Score int64\n",
|
||
"ViewCount int64\n",
|
||
"Body object\n",
|
||
"OwnerUserId object\n",
|
||
"LastEditorUserId object\n",
|
||
"LastEditDate object\n",
|
||
"LastActivityDate object\n",
|
||
"Title object\n",
|
||
"Tags object\n",
|
||
"AnswerCount int64\n",
|
||
"CommentCount int64\n",
|
||
"ContentLicense object\n",
|
||
"ParentId int64\n",
|
||
"ClosedDate object\n",
|
||
"FavoriteCount object\n",
|
||
"CommunityOwnedDate object\n",
|
||
"LastEditorDisplayName object\n",
|
||
"OwnerDisplayName object\n",
|
||
"DataSource object\n",
|
||
"dtype: object\n"
|
||
]
|
||
},
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/plain": [
|
||
" Id PostTypeId AcceptedAnswerId CreationDate Score \\\n",
|
||
"0 1 1 3 2016-08-02T15:39:14.947 10 \n",
|
||
"1 2 1 9 2016-08-02T15:40:20.623 14 \n",
|
||
"2 3 2 0 2016-08-02T15:40:24.820 15 \n",
|
||
"3 4 1 12 2016-08-02T15:41:22.020 33 \n",
|
||
"4 6 1 20 2016-08-02T15:43:35.460 7 \n",
|
||
"\n",
|
||
" ViewCount Body OwnerUserId \\\n",
|
||
"0 710 <p>What does \"backprop\" mean? Is the \"backprop... 8 \n",
|
||
"1 1008 <p>Does increasing the noise in data help to i... 8 \n",
|
||
"2 0 <p>\"Backprop\" is the same as \"backpropagation\"... 4 \n",
|
||
"3 1266 <p>When you're writing your algorithm, how do ... 8 \n",
|
||
"4 279 <p>Given the following definition of an intell... 29 \n",
|
||
"\n",
|
||
" LastEditorUserId LastEditDate ... AnswerCount CommentCount \\\n",
|
||
"0 2444 2019-11-16T17:56:22.093 ... 5 0 \n",
|
||
"1 2444 2019-02-23T22:36:19.090 ... 3 0 \n",
|
||
"2 NaN NaN ... 0 0 \n",
|
||
"3 2444 2021-01-19T23:54:07.813 ... 4 0 \n",
|
||
"4 2444 2019-06-15T18:25:58.513 ... 2 0 \n",
|
||
"\n",
|
||
" ContentLicense ParentId ClosedDate FavoriteCount CommunityOwnedDate \\\n",
|
||
"0 CC BY-SA 4.0 0 NaN NaN NaN \n",
|
||
"1 CC BY-SA 4.0 0 NaN NaN NaN \n",
|
||
"2 CC BY-SA 3.0 1 NaN NaN NaN \n",
|
||
"3 CC BY-SA 3.0 0 NaN NaN NaN \n",
|
||
"4 CC BY-SA 4.0 0 NaN NaN NaN \n",
|
||
"\n",
|
||
" LastEditorDisplayName OwnerDisplayName \\\n",
|
||
"0 NaN NaN \n",
|
||
"1 NaN NaN \n",
|
||
"2 NaN NaN \n",
|
||
"3 NaN NaN \n",
|
||
"4 NaN NaN \n",
|
||
"\n",
|
||
" DataSource \n",
|
||
"0 https://ia600107.us.archive.org/view_archive.p... \n",
|
||
"1 https://ia600107.us.archive.org/view_archive.p... \n",
|
||
"2 https://ia600107.us.archive.org/view_archive.p... \n",
|
||
"3 https://ia600107.us.archive.org/view_archive.p... \n",
|
||
"4 https://ia600107.us.archive.org/view_archive.p... \n",
|
||
"\n",
|
||
"[5 rows x 23 columns]"
|
||
],
|
||
"text/html": [
|
||
"\n",
|
||
" <div id=\"df-776df830-b974-4f15-9190-53bcb8e84bf8\">\n",
|
||
" <div class=\"colab-df-container\">\n",
|
||
" <div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>PostTypeId</th>\n",
|
||
" <th>AcceptedAnswerId</th>\n",
|
||
" <th>CreationDate</th>\n",
|
||
" <th>Score</th>\n",
|
||
" <th>ViewCount</th>\n",
|
||
" <th>Body</th>\n",
|
||
" <th>OwnerUserId</th>\n",
|
||
" <th>LastEditorUserId</th>\n",
|
||
" <th>LastEditDate</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>AnswerCount</th>\n",
|
||
" <th>CommentCount</th>\n",
|
||
" <th>ContentLicense</th>\n",
|
||
" <th>ParentId</th>\n",
|
||
" <th>ClosedDate</th>\n",
|
||
" <th>FavoriteCount</th>\n",
|
||
" <th>CommunityOwnedDate</th>\n",
|
||
" <th>LastEditorDisplayName</th>\n",
|
||
" <th>OwnerDisplayName</th>\n",
|
||
" <th>DataSource</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2016-08-02T15:39:14.947</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>710</td>\n",
|
||
" <td><p>What does \"backprop\" mean? Is the \"backprop...</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>2444</td>\n",
|
||
" <td>2019-11-16T17:56:22.093</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>CC BY-SA 4.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>2016-08-02T15:40:20.623</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>1008</td>\n",
|
||
" <td><p>Does increasing the noise in data help to i...</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>2444</td>\n",
|
||
" <td>2019-02-23T22:36:19.090</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>CC BY-SA 4.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2016-08-02T15:40:24.820</td>\n",
|
||
" <td>15</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td><p>\"Backprop\" is the same as \"backpropagation\"...</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>CC BY-SA 3.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2016-08-02T15:41:22.020</td>\n",
|
||
" <td>33</td>\n",
|
||
" <td>1266</td>\n",
|
||
" <td><p>When you're writing your algorithm, how do ...</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>2444</td>\n",
|
||
" <td>2021-01-19T23:54:07.813</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>CC BY-SA 3.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>2016-08-02T15:43:35.460</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>279</td>\n",
|
||
" <td><p>Given the following definition of an intell...</td>\n",
|
||
" <td>29</td>\n",
|
||
" <td>2444</td>\n",
|
||
" <td>2019-06-15T18:25:58.513</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>CC BY-SA 4.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 23 columns</p>\n",
|
||
"</div>\n",
|
||
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-776df830-b974-4f15-9190-53bcb8e84bf8')\"\n",
|
||
" title=\"Convert this dataframe to an interactive table.\"\n",
|
||
" style=\"display:none;\">\n",
|
||
" \n",
|
||
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
|
||
" width=\"24px\">\n",
|
||
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
|
||
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
|
||
" </svg>\n",
|
||
" </button>\n",
|
||
" \n",
|
||
" <style>\n",
|
||
" .colab-df-container {\n",
|
||
" display:flex;\n",
|
||
" flex-wrap:wrap;\n",
|
||
" gap: 12px;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .colab-df-convert {\n",
|
||
" background-color: #E8F0FE;\n",
|
||
" border: none;\n",
|
||
" border-radius: 50%;\n",
|
||
" cursor: pointer;\n",
|
||
" display: none;\n",
|
||
" fill: #1967D2;\n",
|
||
" height: 32px;\n",
|
||
" padding: 0 0 0 0;\n",
|
||
" width: 32px;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .colab-df-convert:hover {\n",
|
||
" background-color: #E2EBFA;\n",
|
||
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
||
" fill: #174EA6;\n",
|
||
" }\n",
|
||
"\n",
|
||
" [theme=dark] .colab-df-convert {\n",
|
||
" background-color: #3B4455;\n",
|
||
" fill: #D2E3FC;\n",
|
||
" }\n",
|
||
"\n",
|
||
" [theme=dark] .colab-df-convert:hover {\n",
|
||
" background-color: #434B5C;\n",
|
||
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
||
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
||
" fill: #FFFFFF;\n",
|
||
" }\n",
|
||
" </style>\n",
|
||
"\n",
|
||
" <script>\n",
|
||
" const buttonEl =\n",
|
||
" document.querySelector('#df-776df830-b974-4f15-9190-53bcb8e84bf8 button.colab-df-convert');\n",
|
||
" buttonEl.style.display =\n",
|
||
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
||
"\n",
|
||
" async function convertToInteractive(key) {\n",
|
||
" const element = document.querySelector('#df-776df830-b974-4f15-9190-53bcb8e84bf8');\n",
|
||
" const dataTable =\n",
|
||
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
||
" [key], {});\n",
|
||
" if (!dataTable) return;\n",
|
||
"\n",
|
||
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
||
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
||
" + ' to learn more about interactive tables.';\n",
|
||
" element.innerHTML = '';\n",
|
||
" dataTable['output_type'] = 'display_data';\n",
|
||
" await google.colab.output.renderOutput(dataTable, element);\n",
|
||
" const docLink = document.createElement('div');\n",
|
||
" docLink.innerHTML = docLinkHtml;\n",
|
||
" element.appendChild(docLink);\n",
|
||
" }\n",
|
||
" </script>\n",
|
||
" </div>\n",
|
||
" </div>\n",
|
||
" "
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 219
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"# Transformations"
|
||
],
|
||
"metadata": {
|
||
"id": "RAzTR7zY3oan"
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"def filter_only_questions_with_accepted_answers(df):\n",
|
||
" \"\"\"**TODO**\n",
|
||
" Filter only to Questions with Accepted Answers\n",
|
||
"\n",
|
||
" Filter dataframe by questions that have accepted answers, should also include\n",
|
||
" all rows of answers for those questions, even if not accepted.\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" df (DataFrame): containing a \"AcceptedAnswerId\", \"Id\", and \"ParentId\" columns\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" df (DataFrame): current dataframe with filtered results\n",
|
||
" \"\"\"\n",
|
||
" df = df[(df[\"AcceptedAnswerId\"].notnull()) | (df[\"ParentId\"] == df[\"Id\"])]\n",
|
||
"\n",
|
||
"\n",
|
||
"def filter_scores_above(df, question_score_threshold: int = 20, answer_score_threshold: int = 20):\n",
|
||
" \"\"\"**TODO**\n",
|
||
" Filter Dataframe by minimum scores\n",
|
||
"\n",
|
||
" Filter Question and Answer columns by score thresholds to trim lower scoring results\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" df (DataFrame): containing a \"Score\" column\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" df (DataFrame): current dataframe with filtered results\n",
|
||
" \"\"\"\n",
|
||
" df = df[\n",
|
||
" ((df[\"Score\"] >= question_score_threshold) & (df.PostTypeId == 1))\n",
|
||
" | ((df[\"Score\"] >= answer_score_threshold) & (df.PostTypeId == 2))\n",
|
||
" ]\n",
|
||
"\n",
|
||
"\n",
|
||
"def convert_html_to_text(df, column: str = \"Body\"):\n",
|
||
" \"\"\"\n",
|
||
" Convert HTML tags to pure text\n",
|
||
"\n",
|
||
" Feeds HTML text body into BeautifulSoup to parse it to only text. Set aside as\n",
|
||
" function to provide option to skip\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" df (DataFrame): containing a \"Body\" column with HTML\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" df (DataFrame): current dataframe with parsed column\n",
|
||
" \"\"\"\n",
|
||
" df.dropna(subset=[column], inplace=True)\n",
|
||
" df[f\"{column}Clean\"] = df[column].apply(lambda row: bs(row, \"html.parser\").text)\n",
|
||
"\n",
|
||
"\n",
|
||
"def clean_tags(df):\n",
|
||
" \"\"\"\n",
|
||
" Convert Tags into Comma separated\n",
|
||
"\n",
|
||
" Converts Tag slugs into commas separated tags\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" df (DataFrame): containing a \"Tags\" column with slugs\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" df (DataFrame): current dataframe with parsed column\n",
|
||
" \"\"\"\n",
|
||
" df[\"TagsClean\"] = df[\"Tags\"].str.replace(\"-\", \" \").str.replace(\"><\", \", \").str.replace(\"<\", \"\").str.replace(\">\", \"\")\n",
|
||
"\n",
|
||
"\n",
|
||
"# filter_only_questions_with_accepted_answers(df)\n",
|
||
"# filter_scores_above(df)\n",
|
||
"convert_html_to_text(df)\n",
|
||
"clean_tags(df)\n",
|
||
"\n",
|
||
"df[[\"Body\", \"BodyClean\", \"Tags\", \"TagsClean\"]]\n",
|
||
"# print(df.shape)"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 424
|
||
},
|
||
"id": "qyUqc31Z3Z9g",
|
||
"outputId": "18dce8b4-af26-49c9-ee73-6c677177b516"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": [
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/plain": [
|
||
" Body \\\n",
|
||
"0 <p>What does \"backprop\" mean? Is the \"backprop... \n",
|
||
"1 <p>Does increasing the noise in data help to i... \n",
|
||
"2 <p>\"Backprop\" is the same as \"backpropagation\"... \n",
|
||
"3 <p>When you're writing your algorithm, how do ... \n",
|
||
"4 <p>Given the following definition of an intell... \n",
|
||
"... ... \n",
|
||
"23174 <p>The purpose of evaluating the state and act... \n",
|
||
"23175 <p>In machine translation, convolution is a te... \n",
|
||
"23176 <p>One of the key features of ChatGPT is its a... \n",
|
||
"23177 <p>Given a neural network model for Covid-19 c... \n",
|
||
"23178 <p>My question is more related to the fundamen... \n",
|
||
"\n",
|
||
" BodyClean \\\n",
|
||
"0 What does \"backprop\" mean? Is the \"backprop\" t... \n",
|
||
"1 Does increasing the noise in data help to impr... \n",
|
||
"2 \"Backprop\" is the same as \"backpropagation\": i... \n",
|
||
"3 When you're writing your algorithm, how do you... \n",
|
||
"4 Given the following definition of an intellige... \n",
|
||
"... ... \n",
|
||
"23174 The purpose of evaluating the state and action... \n",
|
||
"23175 In machine translation, convolution is a techn... \n",
|
||
"23176 One of the key features of ChatGPT is its abil... \n",
|
||
"23177 Given a neural network model for Covid-19 clas... \n",
|
||
"23178 My question is more related to the fundamental... \n",
|
||
"\n",
|
||
" Tags \\\n",
|
||
"0 <neural-networks><backpropagation><terminology... \n",
|
||
"1 <neural-networks><machine-learning><statistica... \n",
|
||
"2 NaN \n",
|
||
"3 <neural-networks><hyperparameter-optimization>... \n",
|
||
"4 <philosophy><definitions><intelligent-agent> \n",
|
||
"... ... \n",
|
||
"23174 NaN \n",
|
||
"23175 NaN \n",
|
||
"23176 NaN \n",
|
||
"23177 <neural-networks><homework> \n",
|
||
"23178 <search><constraint-satisfaction-problems> \n",
|
||
"\n",
|
||
" TagsClean \n",
|
||
"0 neural networks, backpropagation, terminology,... \n",
|
||
"1 neural networks, machine learning, statistical... \n",
|
||
"2 NaN \n",
|
||
"3 neural networks, hyperparameter optimization, ... \n",
|
||
"4 philosophy, definitions, intelligent agent \n",
|
||
"... ... \n",
|
||
"23174 NaN \n",
|
||
"23175 NaN \n",
|
||
"23176 NaN \n",
|
||
"23177 neural networks, homework \n",
|
||
"23178 search, constraint satisfaction problems \n",
|
||
"\n",
|
||
"[23179 rows x 4 columns]"
|
||
],
|
||
"text/html": [
|
||
"\n",
|
||
" <div id=\"df-c809ce1f-6807-4dfd-97c9-38d47afa28d7\">\n",
|
||
" <div class=\"colab-df-container\">\n",
|
||
" <div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Body</th>\n",
|
||
" <th>BodyClean</th>\n",
|
||
" <th>Tags</th>\n",
|
||
" <th>TagsClean</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td><p>What does \"backprop\" mean? Is the \"backprop...</td>\n",
|
||
" <td>What does \"backprop\" mean? Is the \"backprop\" t...</td>\n",
|
||
" <td><neural-networks><backpropagation><terminology...</td>\n",
|
||
" <td>neural networks, backpropagation, terminology,...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td><p>Does increasing the noise in data help to i...</td>\n",
|
||
" <td>Does increasing the noise in data help to impr...</td>\n",
|
||
" <td><neural-networks><machine-learning><statistica...</td>\n",
|
||
" <td>neural networks, machine learning, statistical...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td><p>\"Backprop\" is the same as \"backpropagation\"...</td>\n",
|
||
" <td>\"Backprop\" is the same as \"backpropagation\": i...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td><p>When you're writing your algorithm, how do ...</td>\n",
|
||
" <td>When you're writing your algorithm, how do you...</td>\n",
|
||
" <td><neural-networks><hyperparameter-optimization>...</td>\n",
|
||
" <td>neural networks, hyperparameter optimization, ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td><p>Given the following definition of an intell...</td>\n",
|
||
" <td>Given the following definition of an intellige...</td>\n",
|
||
" <td><philosophy><definitions><intelligent-agent></td>\n",
|
||
" <td>philosophy, definitions, intelligent agent</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23174</th>\n",
|
||
" <td><p>The purpose of evaluating the state and act...</td>\n",
|
||
" <td>The purpose of evaluating the state and action...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23175</th>\n",
|
||
" <td><p>In machine translation, convolution is a te...</td>\n",
|
||
" <td>In machine translation, convolution is a techn...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23176</th>\n",
|
||
" <td><p>One of the key features of ChatGPT is its a...</td>\n",
|
||
" <td>One of the key features of ChatGPT is its abil...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23177</th>\n",
|
||
" <td><p>Given a neural network model for Covid-19 c...</td>\n",
|
||
" <td>Given a neural network model for Covid-19 clas...</td>\n",
|
||
" <td><neural-networks><homework></td>\n",
|
||
" <td>neural networks, homework</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23178</th>\n",
|
||
" <td><p>My question is more related to the fundamen...</td>\n",
|
||
" <td>My question is more related to the fundamental...</td>\n",
|
||
" <td><search><constraint-satisfaction-problems></td>\n",
|
||
" <td>search, constraint satisfaction problems</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>23179 rows × 4 columns</p>\n",
|
||
"</div>\n",
|
||
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c809ce1f-6807-4dfd-97c9-38d47afa28d7')\"\n",
|
||
" title=\"Convert this dataframe to an interactive table.\"\n",
|
||
" style=\"display:none;\">\n",
|
||
" \n",
|
||
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
|
||
" width=\"24px\">\n",
|
||
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
|
||
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
|
||
" </svg>\n",
|
||
" </button>\n",
|
||
" \n",
|
||
" <style>\n",
|
||
" .colab-df-container {\n",
|
||
" display:flex;\n",
|
||
" flex-wrap:wrap;\n",
|
||
" gap: 12px;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .colab-df-convert {\n",
|
||
" background-color: #E8F0FE;\n",
|
||
" border: none;\n",
|
||
" border-radius: 50%;\n",
|
||
" cursor: pointer;\n",
|
||
" display: none;\n",
|
||
" fill: #1967D2;\n",
|
||
" height: 32px;\n",
|
||
" padding: 0 0 0 0;\n",
|
||
" width: 32px;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .colab-df-convert:hover {\n",
|
||
" background-color: #E2EBFA;\n",
|
||
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
||
" fill: #174EA6;\n",
|
||
" }\n",
|
||
"\n",
|
||
" [theme=dark] .colab-df-convert {\n",
|
||
" background-color: #3B4455;\n",
|
||
" fill: #D2E3FC;\n",
|
||
" }\n",
|
||
"\n",
|
||
" [theme=dark] .colab-df-convert:hover {\n",
|
||
" background-color: #434B5C;\n",
|
||
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
||
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
||
" fill: #FFFFFF;\n",
|
||
" }\n",
|
||
" </style>\n",
|
||
"\n",
|
||
" <script>\n",
|
||
" const buttonEl =\n",
|
||
" document.querySelector('#df-c809ce1f-6807-4dfd-97c9-38d47afa28d7 button.colab-df-convert');\n",
|
||
" buttonEl.style.display =\n",
|
||
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
||
"\n",
|
||
" async function convertToInteractive(key) {\n",
|
||
" const element = document.querySelector('#df-c809ce1f-6807-4dfd-97c9-38d47afa28d7');\n",
|
||
" const dataTable =\n",
|
||
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
||
" [key], {});\n",
|
||
" if (!dataTable) return;\n",
|
||
"\n",
|
||
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
||
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
||
" + ' to learn more about interactive tables.';\n",
|
||
" element.innerHTML = '';\n",
|
||
" dataTable['output_type'] = 'display_data';\n",
|
||
" await google.colab.output.renderOutput(dataTable, element);\n",
|
||
" const docLink = document.createElement('div');\n",
|
||
" docLink.innerHTML = docLinkHtml;\n",
|
||
" element.appendChild(docLink);\n",
|
||
" }\n",
|
||
" </script>\n",
|
||
" </div>\n",
|
||
" </div>\n",
|
||
" "
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 220
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
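"This joins each question to its answers with a left merge, so the result has one row per question-answer pair and a question with no answers is still kept (with empty answer columns). It then creates an `AcceptedAnswerFlag` column that is True if the answer was accepted by the person who asked the question. The `number_of_results` variable limits how many answers are kept for each question: the accepted answer is ranked first, followed by the remaining answers in descending order of score."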
|
||
],
|
||
"metadata": {
|
||
"id": "C09Bwdw-44PZ"
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"questions = df[df.PostTypeId == 1]\n",
|
||
"answers = df[df.PostTypeId == 2]\n",
|
||
"\n",
|
||
"df = pd.merge(\n",
|
||
" questions,\n",
|
||
" answers[\n",
|
||
" [\n",
|
||
" \"Id\",\n",
|
||
" \"CreationDate\",\n",
|
||
" \"Score\",\n",
|
||
" \"ViewCount\",\n",
|
||
" \"CommentCount\",\n",
|
||
" \"ContentLicense\",\n",
|
||
" \"TagsClean\",\n",
|
||
" \"BodyClean\",\n",
|
||
" \"ParentId\",\n",
|
||
" ]\n",
|
||
" ],\n",
|
||
" left_on=\"Id\",\n",
|
||
" right_on=\"ParentId\",\n",
|
||
" suffixes=(\"_q\", \"_a\"),\n",
|
||
" how=\"left\",\n",
|
||
")\n",
|
||
"\n",
|
||
"df[\"AcceptedAnswerFlag\"] = df.apply(lambda row: row[\"Id_a\"] == row[\"AcceptedAnswerId\"], axis=1)\n",
|
||
"\n",
|
||
"df = df.rename(\n",
|
||
" columns={\n",
|
||
" \"BodyClean_q\": \"Question\",\n",
|
||
" \"Score_q\": \"QuestionScore\",\n",
|
||
" \"TagsClean_q\": \"QuestionTags\",\n",
|
||
" \"BodyClean_a\": \"Answer\",\n",
|
||
" \"Score_a\": \"AnswerScore\",\n",
|
||
" \"ContentLicense_q\": \"QuestionContentLicense\",\n",
|
||
" \"ContentLicense_a\": \"AnswerContentLicense\",\n",
|
||
" \"CreationDate_q\": \"CreationDate\",\n",
|
||
" }\n",
|
||
")\n",
|
||
"\n",
|
||
"## Lower number_of_results to keep only the top N answers per question (accepted answer first, then highest score).\n",
|
||
"number_of_results = 25\n",
|
||
"df = (\n",
|
||
" df.sort_values(by=[\"AcceptedAnswerFlag\", \"AnswerScore\"], ascending=[False, False])\n",
|
||
" .groupby(\"Question\")\n",
|
||
" .head(number_of_results)\n",
|
||
" .reset_index(drop=True)\n",
|
||
")\n",
|
||
"\n",
|
||
"df[[\"Id_q\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].head()"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 337
|
||
},
|
||
"id": "Bgz2fZ9k43Ab",
|
||
"outputId": "28896d69-03cd-4877-fdfb-ae48dafa4ff3"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": [
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/plain": [
|
||
" Id_q Question QuestionScore \\\n",
|
||
"0 1768 In Portal 2 we see that AI's can be \"killed\" b... 175 \n",
|
||
"1 10623 What is self-supervised learning in machine le... 91 \n",
|
||
"2 111 Obviously, self-driving cars aren't perfect, s... 100 \n",
|
||
"3 14224 If the original purpose for developing AI was ... 69 \n",
|
||
"4 1479 Do scientists or research experts know from th... 94 \n",
|
||
"\n",
|
||
" QuestionTags Id_a \\\n",
|
||
"0 philosophy, decision theory, mythology of ai, ... 1769.0 \n",
|
||
"1 machine learning, comparison, supervised learn... 10624.0 \n",
|
||
"2 philosophy, ethics, autonomous vehicles, decis... 1790.0 \n",
|
||
"3 philosophy, social, explainable ai 14247.0 \n",
|
||
"4 neural networks, deep learning, convolutional ... 4044.0 \n",
|
||
"\n",
|
||
" Answer AnswerScore \\\n",
|
||
"0 This classic problem exhibits a basic misunder... 146.0 \n",
|
||
"1 Introduction\\nThe term self-supervised learnin... 90.0 \n",
|
||
"2 \\nHow could self-driving cars make ethical dec... 76.0 \n",
|
||
"3 As argued by Selvaraju et al., there are three... 75.0 \n",
|
||
"4 There are many approaches that aim to make a t... 69.0 \n",
|
||
"\n",
|
||
" AcceptedAnswerFlag \n",
|
||
"0 True \n",
|
||
"1 True \n",
|
||
"2 True \n",
|
||
"3 True \n",
|
||
"4 True "
|
||
],
|
||
"text/html": [
|
||
"\n",
|
||
" <div id=\"df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5\">\n",
|
||
" <div class=\"colab-df-container\">\n",
|
||
" <div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id_q</th>\n",
|
||
" <th>Question</th>\n",
|
||
" <th>QuestionScore</th>\n",
|
||
" <th>QuestionTags</th>\n",
|
||
" <th>Id_a</th>\n",
|
||
" <th>Answer</th>\n",
|
||
" <th>AnswerScore</th>\n",
|
||
" <th>AcceptedAnswerFlag</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1768</td>\n",
|
||
" <td>In Portal 2 we see that AI's can be \"killed\" b...</td>\n",
|
||
" <td>175</td>\n",
|
||
" <td>philosophy, decision theory, mythology of ai, ...</td>\n",
|
||
" <td>1769.0</td>\n",
|
||
" <td>This classic problem exhibits a basic misunder...</td>\n",
|
||
" <td>146.0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>10623</td>\n",
|
||
" <td>What is self-supervised learning in machine le...</td>\n",
|
||
" <td>91</td>\n",
|
||
" <td>machine learning, comparison, supervised learn...</td>\n",
|
||
" <td>10624.0</td>\n",
|
||
" <td>Introduction\\nThe term self-supervised learnin...</td>\n",
|
||
" <td>90.0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>111</td>\n",
|
||
" <td>Obviously, self-driving cars aren't perfect, s...</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>philosophy, ethics, autonomous vehicles, decis...</td>\n",
|
||
" <td>1790.0</td>\n",
|
||
" <td>\\nHow could self-driving cars make ethical dec...</td>\n",
|
||
" <td>76.0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>14224</td>\n",
|
||
" <td>If the original purpose for developing AI was ...</td>\n",
|
||
" <td>69</td>\n",
|
||
" <td>philosophy, social, explainable ai</td>\n",
|
||
" <td>14247.0</td>\n",
|
||
" <td>As argued by Selvaraju et al., there are three...</td>\n",
|
||
" <td>75.0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1479</td>\n",
|
||
" <td>Do scientists or research experts know from th...</td>\n",
|
||
" <td>94</td>\n",
|
||
" <td>neural networks, deep learning, convolutional ...</td>\n",
|
||
" <td>4044.0</td>\n",
|
||
" <td>There are many approaches that aim to make a t...</td>\n",
|
||
" <td>69.0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>\n",
|
||
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5')\"\n",
|
||
" title=\"Convert this dataframe to an interactive table.\"\n",
|
||
" style=\"display:none;\">\n",
|
||
" \n",
|
||
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
|
||
" width=\"24px\">\n",
|
||
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
|
||
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
|
||
" </svg>\n",
|
||
" </button>\n",
|
||
" \n",
|
||
" <style>\n",
|
||
" .colab-df-container {\n",
|
||
" display:flex;\n",
|
||
" flex-wrap:wrap;\n",
|
||
" gap: 12px;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .colab-df-convert {\n",
|
||
" background-color: #E8F0FE;\n",
|
||
" border: none;\n",
|
||
" border-radius: 50%;\n",
|
||
" cursor: pointer;\n",
|
||
" display: none;\n",
|
||
" fill: #1967D2;\n",
|
||
" height: 32px;\n",
|
||
" padding: 0 0 0 0;\n",
|
||
" width: 32px;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .colab-df-convert:hover {\n",
|
||
" background-color: #E2EBFA;\n",
|
||
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
||
" fill: #174EA6;\n",
|
||
" }\n",
|
||
"\n",
|
||
" [theme=dark] .colab-df-convert {\n",
|
||
" background-color: #3B4455;\n",
|
||
" fill: #D2E3FC;\n",
|
||
" }\n",
|
||
"\n",
|
||
" [theme=dark] .colab-df-convert:hover {\n",
|
||
" background-color: #434B5C;\n",
|
||
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
||
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
||
" fill: #FFFFFF;\n",
|
||
" }\n",
|
||
" </style>\n",
|
||
"\n",
|
||
" <script>\n",
|
||
" const buttonEl =\n",
|
||
" document.querySelector('#df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5 button.colab-df-convert');\n",
|
||
" buttonEl.style.display =\n",
|
||
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
||
"\n",
|
||
" async function convertToInteractive(key) {\n",
|
||
" const element = document.querySelector('#df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5');\n",
|
||
" const dataTable =\n",
|
||
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
||
" [key], {});\n",
|
||
" if (!dataTable) return;\n",
|
||
"\n",
|
||
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
||
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
||
" + ' to learn more about interactive tables.';\n",
|
||
" element.innerHTML = '';\n",
|
||
" dataTable['output_type'] = 'display_data';\n",
|
||
" await google.colab.output.renderOutput(dataTable, element);\n",
|
||
" const docLink = document.createElement('div');\n",
|
||
" docLink.innerHTML = docLinkHtml;\n",
|
||
" element.appendChild(docLink);\n",
|
||
" }\n",
|
||
" </script>\n",
|
||
" </div>\n",
|
||
" </div>\n",
|
||
" "
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 221
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"testing_id = df.Id_q.mode()[0]\n",
|
||
"df[(df.Id_q == testing_id) | (df.ParentId_a == testing_id)][\n",
|
||
" [\"Id_q\", \"Question\", \"ParentId_a\", \"AcceptedAnswerId\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]\n",
|
||
"]\n",
|
||
"# df[['Id_q', 'Question', 'ParentId_a', 'AcceptedAnswerId', 'Id_a', 'Answer', 'AnswerScore', 'AcceptedAnswerFlag']]"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 645
|
||
},
|
||
"id": "eds1K8WL9QPo",
|
||
"outputId": "bc526503-d6dd-428f-fa98-ad419d26a7dc"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": [
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/plain": [
|
||
" Id_q Question ParentId_a \\\n",
|
||
"7 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"3662 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"3713 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"3788 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"3821 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"3882 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"4389 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"4849 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"4850 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"5763 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"5764 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"5765 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"7462 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"7463 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"7464 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"7465 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"7466 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"7467 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"9481 15730 As a human being, we can think infinity. In pr... 15730.0 \n",
|
||
"\n",
|
||
" AcceptedAnswerId Id_a \\\n",
|
||
"7 15744 15744.0 \n",
|
||
"3662 15744 15753.0 \n",
|
||
"3713 15744 15747.0 \n",
|
||
"3788 15744 15756.0 \n",
|
||
"3821 15744 15758.0 \n",
|
||
"3882 15744 15762.0 \n",
|
||
"4389 15744 15783.0 \n",
|
||
"4849 15744 15740.0 \n",
|
||
"4850 15744 15803.0 \n",
|
||
"5763 15744 15768.0 \n",
|
||
"5764 15744 15810.0 \n",
|
||
"5765 15744 15943.0 \n",
|
||
"7462 15744 15779.0 \n",
|
||
"7463 15744 15787.0 \n",
|
||
"7464 15744 15801.0 \n",
|
||
"7465 15744 15930.0 \n",
|
||
"7466 15744 15934.0 \n",
|
||
"7467 15744 15938.0 \n",
|
||
"9481 15744 15931.0 \n",
|
||
"\n",
|
||
" Answer AnswerScore \\\n",
|
||
"7 I think this is a fairly common misconception ... 62.0 \n",
|
||
"3662 I think your premise is flawed.\\nYou seem to a... 19.0 \n",
|
||
"3713 TL;DR: The subtleties of infinity are made app... 12.0 \n",
|
||
"3788 In Haskell, you can type:\\nprint [1..]\\nand it... 9.0 \n",
|
||
"3821 I believe humans can be said to understand inf... 8.0 \n",
|
||
"3882 (There's a summary at the bottom for those who... 7.0 \n",
|
||
"4389 Then premise assumes that humans \"understand\" ... 4.0 \n",
|
||
"4849 By adding some rules for infinity in arithmeti... 3.0 \n",
|
||
"4850 I think the concept that is missing in the dis... 3.0 \n",
|
||
"5763 Computers don't understand \"infinity\" or even ... 2.0 \n",
|
||
"5764 The Questions That Computers Can Never Answer ... 2.0 \n",
|
||
"5765 John Doucette's answer covers my thoughts on t... 2.0 \n",
|
||
"7462 I would think that a computer couldn’t underst... 1.0 \n",
|
||
"7463 The \"concept\" of infinity is 1 thing to unders... 1.0 \n",
|
||
"7464 Just food for thought: how about if we try to ... 1.0 \n",
|
||
"7465 Its arguable if we humans understand infinity.... 1.0 \n",
|
||
"7466 Well -- just to touch on the question of peopl... 1.0 \n",
|
||
"7467 Humans certainly don't understand infinity. Cu... 1.0 \n",
|
||
"9481 I think the property humans have which compute... 0.0 \n",
|
||
"\n",
|
||
" AcceptedAnswerFlag \n",
|
||
"7 True \n",
|
||
"3662 False \n",
|
||
"3713 False \n",
|
||
"3788 False \n",
|
||
"3821 False \n",
|
||
"3882 False \n",
|
||
"4389 False \n",
|
||
"4849 False \n",
|
||
"4850 False \n",
|
||
"5763 False \n",
|
||
"5764 False \n",
|
||
"5765 False \n",
|
||
"7462 False \n",
|
||
"7463 False \n",
|
||
"7464 False \n",
|
||
"7465 False \n",
|
||
"7466 False \n",
|
||
"7467 False \n",
|
||
"9481 False "
|
||
],
|
||
"text/html": [
|
||
"\n",
|
||
" <div id=\"df-16d171db-e359-46f3-a969-510a35cee78f\">\n",
|
||
" <div class=\"colab-df-container\">\n",
|
||
" <div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id_q</th>\n",
|
||
" <th>Question</th>\n",
|
||
" <th>ParentId_a</th>\n",
|
||
" <th>AcceptedAnswerId</th>\n",
|
||
" <th>Id_a</th>\n",
|
||
" <th>Answer</th>\n",
|
||
" <th>AnswerScore</th>\n",
|
||
" <th>AcceptedAnswerFlag</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15744.0</td>\n",
|
||
" <td>I think this is a fairly common misconception ...</td>\n",
|
||
" <td>62.0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3662</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15753.0</td>\n",
|
||
" <td>I think your premise is flawed.\\nYou seem to a...</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3713</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15747.0</td>\n",
|
||
" <td>TL;DR: The subtleties of infinity are made app...</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3788</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15756.0</td>\n",
|
||
" <td>In Haskell, you can type:\\nprint [1..]\\nand it...</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3821</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15758.0</td>\n",
|
||
" <td>I believe humans can be said to understand inf...</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3882</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15762.0</td>\n",
|
||
" <td>(There's a summary at the bottom for those who...</td>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4389</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15783.0</td>\n",
|
||
" <td>Then premise assumes that humans \"understand\" ...</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4849</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15740.0</td>\n",
|
||
" <td>By adding some rules for infinity in arithmeti...</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4850</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15803.0</td>\n",
|
||
" <td>I think the concept that is missing in the dis...</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5763</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15768.0</td>\n",
|
||
" <td>Computers don't understand \"infinity\" or even ...</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5764</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15810.0</td>\n",
|
||
" <td>The Questions That Computers Can Never Answer ...</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5765</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15943.0</td>\n",
|
||
" <td>John Doucette's answer covers my thoughts on t...</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7462</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15779.0</td>\n",
|
||
" <td>I would think that a computer couldn’t underst...</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7463</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15787.0</td>\n",
|
||
" <td>The \"concept\" of infinity is 1 thing to unders...</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7464</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15801.0</td>\n",
|
||
" <td>Just food for thought: how about if we try to ...</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7465</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15930.0</td>\n",
|
||
" <td>Its arguable if we humans understand infinity....</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7466</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15934.0</td>\n",
|
||
" <td>Well -- just to touch on the question of peopl...</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7467</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15938.0</td>\n",
|
||
" <td>Humans certainly don't understand infinity. Cu...</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9481</th>\n",
|
||
" <td>15730</td>\n",
|
||
" <td>As a human being, we can think infinity. In pr...</td>\n",
|
||
" <td>15730.0</td>\n",
|
||
" <td>15744</td>\n",
|
||
" <td>15931.0</td>\n",
|
||
" <td>I think the property humans have which compute...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>\n",
|
||
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-16d171db-e359-46f3-a969-510a35cee78f')\"\n",
|
||
" title=\"Convert this dataframe to an interactive table.\"\n",
|
||
" style=\"display:none;\">\n",
|
||
" \n",
|
||
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
|
||
" width=\"24px\">\n",
|
||
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
|
||
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
|
||
" </svg>\n",
|
||
" </button>\n",
|
||
" \n",
|
||
" <style>\n",
|
||
" .colab-df-container {\n",
|
||
" display:flex;\n",
|
||
" flex-wrap:wrap;\n",
|
||
" gap: 12px;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .colab-df-convert {\n",
|
||
" background-color: #E8F0FE;\n",
|
||
" border: none;\n",
|
||
" border-radius: 50%;\n",
|
||
" cursor: pointer;\n",
|
||
" display: none;\n",
|
||
" fill: #1967D2;\n",
|
||
" height: 32px;\n",
|
||
" padding: 0 0 0 0;\n",
|
||
" width: 32px;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .colab-df-convert:hover {\n",
|
||
" background-color: #E2EBFA;\n",
|
||
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
||
" fill: #174EA6;\n",
|
||
" }\n",
|
||
"\n",
|
||
" [theme=dark] .colab-df-convert {\n",
|
||
" background-color: #3B4455;\n",
|
||
" fill: #D2E3FC;\n",
|
||
" }\n",
|
||
"\n",
|
||
" [theme=dark] .colab-df-convert:hover {\n",
|
||
" background-color: #434B5C;\n",
|
||
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
||
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
||
" fill: #FFFFFF;\n",
|
||
" }\n",
|
||
" </style>\n",
|
||
"\n",
|
||
" <script>\n",
|
||
" const buttonEl =\n",
|
||
" document.querySelector('#df-16d171db-e359-46f3-a969-510a35cee78f button.colab-df-convert');\n",
|
||
" buttonEl.style.display =\n",
|
||
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
||
"\n",
|
||
" async function convertToInteractive(key) {\n",
|
||
" const element = document.querySelector('#df-16d171db-e359-46f3-a969-510a35cee78f');\n",
|
||
" const dataTable =\n",
|
||
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
||
" [key], {});\n",
|
||
" if (!dataTable) return;\n",
|
||
"\n",
|
||
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
||
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
||
" + ' to learn more about interactive tables.';\n",
|
||
" element.innerHTML = '';\n",
|
||
" dataTable['output_type'] = 'display_data';\n",
|
||
" await google.colab.output.renderOutput(dataTable, element);\n",
|
||
" const docLink = document.createElement('div');\n",
|
||
" docLink.innerHTML = docLinkHtml;\n",
|
||
" element.appendChild(docLink);\n",
|
||
" }\n",
|
||
" </script>\n",
|
||
" </div>\n",
|
||
" </div>\n",
|
||
" "
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 222
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"# Create JSONL version of Dataframe\n",
|
||
"This groups the dataframe by the question-level columns and creates a nested list of answers for each group. The result is a list of JSON objects, each representing a single question in the dataset, with an `Answers` key that holds a list of dictionaries, one per answer to that question."
|
||
],
|
||
"metadata": {
|
||
"id": "gXgpXEO7DCbj"
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"j = (\n",
|
||
" df.groupby(\n",
|
||
" [\"Title\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"QuestionContentLicense\", \"DataSource\", \"CreationDate\"]\n",
|
||
" )\n",
|
||
" .apply(lambda x: x[[\"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].to_dict(\"records\"))\n",
|
||
" .reset_index()\n",
|
||
" .rename(columns={0: \"Answers\"})\n",
|
||
" .to_json(orient=\"records\")\n",
|
||
")\n",
|
||
"\n",
|
||
"data = json.loads(j)\n",
|
||
"\n",
|
||
"for post in data:\n",
|
||
" if len(post.get(\"Answers\")) >= 4:\n",
|
||
" print(json.dumps(post, indent=4))\n",
|
||
" break"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "OBR58MSRzAMP",
|
||
"outputId": "c7da1e6c-3a97-465d-c9ba-7e055cb0d751"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"{\n",
|
||
" \"Title\": \"1 hidden layer with 1000 neurons vs. 10 hidden layers with 100 neurons\",\n",
|
||
" \"Question\": \"These types of questions may be problem-dependent, but I have tried to find research that addresses the question whether the number of hidden layers and their size (number of neurons in each layer) really matter or not.\\nSo my question is, does it really matter if we for example have 1 large hidden layer of 1000 neurons vs. 10 hidden layers with 100 neurons each?\\n\",\n",
|
||
" \"QuestionScore\": 16,\n",
|
||
" \"QuestionTags\": \"neural networks\",\n",
|
||
" \"QuestionContentLicense\": \"CC BY-SA 3.0\",\n",
|
||
" \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n",
|
||
" \"CreationDate\": \"2017-05-04T13:06:37.990\",\n",
|
||
" \"Answers\": [\n",
|
||
" {\n",
|
||
" \"Answer\": \"Basically, having multiple layers (aka a deep network) makes your network more eager to recognize certain aspects of input data. For example, if you have the details of a house (size, lawn size, location etc.) as input and want to predict the price. The first layer may predict:\\n\\nBig area, higher price\\nSmall amount of bedrooms, lower price\\n\\nThe second layer might conclude:\\n\\nBig area + small amount of bedrooms = large bedrooms = +- effect\\n\\nYes, one layer can also 'detect' the stats, however it will require more neurons as it cannot rely on other neurons to do 'parts' of the total calculation required to detect that stat.\\nCheck out this answer\\n\",\n",
|
||
" \"AnswerScore\": 13.0,\n",
|
||
" \"AcceptedAnswerFlag\": true\n",
|
||
" },\n",
|
||
" {\n",
|
||
" \"Answer\": \"There are so many aspects.\\n1. Training:\\nTraining deep nets is a hard job due to the vanishing (rearly exploding) gradient problem. So building a 10x100 neural-net is not recommended.\\n2. Trained network performance:\\n\\nInformation loss:\\nThe classical usage of neural nets is the classification problem. Which means we want to get some well defined information from the data. (Ex. Is there a face in the picture or not.)\\nSo usually classification problem has a lot of input, and few output, whats more the size of the hidden layers are descend from input to output.\\nHowever, we loss information using less neurons layer by layer. (Ie. We cannot reproduce the original image based on the fact that is there a face on it or no.) So you must know that you loss information using 100 neurons if the size of the input is (lets say) 1000.\\nInformation complexity: However the deeper nets (as Tomas W mentioned) can fetch more complex information from the input data. Inspite of this its not recommended to use 10 fully connected layers. Its recommended to use convolutional/relu/maxpooling or other type of layers. Firest layers can compress the some essential part of the inputs. (Ex is there any line in a specific part of the picture) Second layers can say: There is a specific shape in this place in the picture. Etc etc.\\n\\nSo deeper nets are more \\\"clever\\\" but 10x100 net structure is a good choice.\\n\",\n",
|
||
" \"AnswerScore\": 4.0,\n",
|
||
" \"AcceptedAnswerFlag\": false\n",
|
||
" },\n",
|
||
" {\n",
|
||
" \"Answer\": \"If the problem you are solving is linearly separable, one layer of 1000 neurons can do better job than 10 layers with each of 100 neurons.\\nIf the problem is non linear and not convex, then you need deep neural nets. \\n\",\n",
|
||
" \"AnswerScore\": 1.0,\n",
|
||
" \"AcceptedAnswerFlag\": false\n",
|
||
" },\n",
|
||
" {\n",
|
||
" \"Answer\": \"\\nI think you have a confusion in the basics of the neural networks.\\n Every layer has a separate activation function and input/output\\n connection weights.\\n\\nThe output of the first hidden layer will be multiplied by a weight, processed by an activation function in the next layer and so on.\\nSingle layer neural networks are very limited for simple tasks, deeper NN can perform far better than a single layer. \\nHowever, do not use more than layer if your application is not fairly complex. In conclusion, 100 neurons layer does not mean better neural network than 10 layers x 10 neurons but 10 layers are something imaginary unless you are doing deep learning. start with 10 neurons in the hidden layer and try to add layers or add more neurons to the same layer to see the difference. learning with more layers will be easier but more training time is required.\\n\",\n",
|
||
" \"AnswerScore\": 0.0,\n",
|
||
" \"AcceptedAnswerFlag\": false\n",
|
||
" }\n",
|
||
" ]\n",
|
||
"}\n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"# Save file\n",
|
||
"\n",
|
||
"Files can be saved as JSON, JSONL, CSV, or Parquet"
|
||
],
|
||
"metadata": {
|
||
"id": "PlNjrpXaDm1_"
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"file_name = dataset_name\n",
|
||
"\n",
|
||
"\n",
|
||
"def save_data(data: list, file_name: str, file_type: str = [\"csv\", \"json\", \"jsonl\", \"parquet\"]):\n",
|
||
" \"\"\"\n",
|
||
" Save Data to file\n",
|
||
"\n",
|
||
" Save Data list to file as either JSON or JSONL\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" data (list): list of dictionaries\n",
|
||
" file_name (str): name of file (no extension)\n",
|
||
" jsonl (bool): to save file as either JSON or JSONL\n",
|
||
" \"\"\"\n",
|
||
" file_type = file_type.lower()\n",
|
||
"\n",
|
||
" if file_type == \"csv\" and isinstance(data, pd.DataFrame):\n",
|
||
" data.to_csv(f\"/content/{file_name}.csv\", index=False)\n",
|
||
"\n",
|
||
" elif file_type == \"json\" and isinstance(data, list):\n",
|
||
" print(json.dumps(data, indent=4), file=open(f\"/content/{file_name}.json\", \"w\"))\n",
|
||
"\n",
|
||
" elif file_type == \"jsonl\" and isinstance(data, list):\n",
|
||
" for item in data:\n",
|
||
" print(json.dumps(item), file=open(f\"/content/{file_name}.jsonl\", \"a\"))\n",
|
||
"\n",
|
||
" elif file_type == \"parquet\" and isinstance(data, pd.DataFrame):\n",
|
||
" data.to_parquet(f\"/content/{file_name}.parquet\", index=False)\n",
|
||
"\n",
|
||
" else:\n",
|
||
" print(\"Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\")\n",
|
||
"\n",
|
||
"\n",
|
||
"# save_data(data=data, file_name=file_name, file_type='jsonl')\n",
|
||
"# save_data(data=df, file_name=file_name, file_type='parquet')"
|
||
],
|
||
"metadata": {
|
||
"id": "CU0gWRGQDqIs",
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"outputId": "9646e475-cedd-46f1-f9b8-7eb1fbc703c7"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"# Open-Assistant Data Scheme\n",
|
||
"\n",
|
||
"Testing putting the data into the Open-Assistant Data Scheme\n",
|
||
"\n",
|
||
"https://github.com/LAION-AI/Open-Assistant/blob/main/docs/data_schemas.md"
|
||
],
|
||
"metadata": {
|
||
"id": "BdN3hKxtgH7f"
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"from typing import TypeVar, List, Dict, Any, Literal\n",
|
||
"from json import JSONEncoder\n",
|
||
"\n",
|
||
"T = TypeVar(\"T\", bound=\"ConversationTreeNode\")\n",
|
||
"\n",
|
||
"\n",
|
||
"class ConversationTreeNode:\n",
|
||
" text: str # The text of the node\n",
|
||
" role: Literal[\"prompter\", \"assistant\"] # Whether the node is a user prompt/follow-up or an assistant response\n",
|
||
" children: List[T] # The children of the node (if you have a linear conversation, this will be of length 0 or 1)\n",
|
||
" metadata: Dict[str, Any] # Node metadata (see below)\n",
|
||
"\n",
|
||
" def __init__(\n",
|
||
" self, text: str, role: Literal[\"prompter\", \"assistant\"], children: List[T], metadata: Dict[str, Any]\n",
|
||
" ) -> None:\n",
|
||
" self.text = text\n",
|
||
" self.role = role\n",
|
||
" self.children = children\n",
|
||
" self.metadata = metadata\n",
|
||
"\n",
|
||
"\n",
|
||
"class ConversationTree:\n",
|
||
" root: ConversationTreeNode # The node containing the initial prompt\n",
|
||
" metadata: Dict[str, Any] # Tree metadata, different from root node metadata.\n",
|
||
"\n",
|
||
" def __init__(self, root: ConversationTreeNode, metadata: Dict[str, Any]) -> None:\n",
|
||
" self.root = root\n",
|
||
" self.metadata = metadata\n",
|
||
"\n",
|
||
"\n",
|
||
"# subclass JSONEncoder\n",
|
||
"class TreeEncoder(JSONEncoder):\n",
|
||
" def default(self, o):\n",
|
||
" return o.__dict__"
|
||
],
|
||
"metadata": {
|
||
"id": "n8ubYQxegNSY"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"source": [
|
||
"conversation_forest = []\n",
|
||
"\n",
|
||
"tree_metadata_map = {\"Title\": str, \"QuestionContentLicense\": str, \"DataSource\": str, \"CreationDate\": str}\n",
|
||
"question_metadata_map = {\"QuestionScore\": int, \"QuestionTags\": str}\n",
|
||
"answer_metadata_map = {\"AnswerScore\": int, \"AcceptedAnswerFlag\": bool}\n",
|
||
"\n",
|
||
"\n",
|
||
"for item in data:\n",
|
||
" prompt = item.get(\"Question\")\n",
|
||
" metadata = {k: v for k, v in item.items() if k in question_metadata_map}\n",
|
||
" root = ConversationTreeNode(text=prompt, role=\"prompter\", children=[], metadata=metadata)\n",
|
||
"\n",
|
||
" for answer in item.get(\"Answers\"):\n",
|
||
" response = answer.get(\"Answer\")\n",
|
||
" metadata = {k: v for k, v in answer.items() if k in answer_metadata_map}\n",
|
||
" child = ConversationTreeNode(text=response, role=\"assistant\", children=[], metadata=metadata)\n",
|
||
" root.children.append(child)\n",
|
||
"\n",
|
||
" metadata = {k: v for k, v in item.items() if k in tree_metadata_map}\n",
|
||
" conversation_tree = ConversationTree(root=root, metadata=metadata)\n",
|
||
" conversation_forest.append(conversation_tree)\n",
|
||
"\n",
|
||
"\n",
|
||
"conversation_forest_json = [\n",
|
||
" json.loads(TreeEncoder().encode(conversation_tree)) for conversation_tree in conversation_forest\n",
|
||
"]\n",
|
||
"\n",
|
||
"\n",
|
||
"# print(json.dumps(conversation_forest_json[0], indent=4))\n",
|
||
"\n",
|
||
"\n",
|
||
"print(json.dumps(conversation_forest_json, indent=4), file=open(f\"/content/{file_name}.json\", \"w\"))"
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "eE0fkytExSGl",
|
||
"outputId": "594632d6-f98c-49b8-af86-25f7f5e2ce06"
|
||
},
|
||
"execution_count": null,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"{\n",
|
||
" \"root\": {\n",
|
||
" \"text\": \"Science Fiction has frequently shown AI to be a threat to the very existence of mankind. AI systems have often been the antagonists in many works of fiction, from 2001: A Space Odyssey through to The Terminator and beyond.\\nThe Media seems to buy into this trope as well. And in recent years we have had people like Elon Musk warn us of the dangers of an impending AI revolution, stating that AI is more dangerous than nukes.\\nAnd, apparently, experts think that we will be seeing this AI revolution in the next 100 years.\\nHowever, from my (albeit limited) study of AI, I get the impression that they are all wrong. I am going to outline my understanding below, please correct me if I am wrong:\\n\\nFirstly, all of these things seem to be confusing Artificial Intelligence with Artificial Consciousness. AI is essentially a system to make intelligent decisions, whereas AC is more like the \\\"self-aware\\\" systems that are shown in science fiction.\\n\\nNot AI itself, but intelligence and intelligent decision-making algorithms are something we've been working with and enhancing since before computers have been around. Moving this over to an artificial framework is fairly easy. However, consciousness is still something we are learning about. 
My guess is we won't be able to re-create something artificially if we barely understand how it works in the real world.\\n\\nSo, my conclusion is that no AI system will be able to learn enough to start thinking for itself, and that all our warnings of AI are completely unjustified.\\n\\nThe real danger comes from AC, which we are a long, long way from realizing because we are still a long way off from defining exactly what consciousness is, let alone understanding it.\\n\\n\\n\\nSo, my question is, assuming that my understanding is correct, are any efforts are being made by companies or organizations that work with AI to correct these popular misunderstandings in sci-fi, the media, and/or the public?\\nOr are the proponents of AI ambivalent towards this public fear-mongering?\\nI understand that the fear mongering is going to remain popular for some time, as bad news sells better than good news. I am just wondering if the general attitude from AI organizations is to ignore this popular misconception, or whether a concerted effort is being made to fight against these AI myths (but unfortunately nobody in the media is listening or cares).\\n\",\n",
|
||
" \"role\": \"prompter\",\n",
|
||
" \"children\": [\n",
|
||
" {\n",
|
||
" \"text\": \"Nothing. \\nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators makes a perception that the field has greater capabilities than it does $\\\\rightarrow$ this leads to grants, funding, etc... \\nIs there any negative? Yes. Misconceptions always have drawbacks. We see the creation of dumb ethics boards and such cough cough Elon Musk.\\nBut if history has anything to say about this, as the field gains popularity (which it is dnagerously quick), information will spread by definition, and eventually misconceptions will be laid to rest.\\nNote that this answer is biased and based upon my own opinions\\n\",\n",
|
||
" \"role\": \"assistant\",\n",
|
||
" \"children\": [],\n",
|
||
" \"metadata\": {\n",
|
||
" \"AnswerScore\": 2.0,\n",
|
||
" \"AcceptedAnswerFlag\": true\n",
|
||
" }\n",
|
||
" }\n",
|
||
" ],\n",
|
||
" \"metadata\": {\n",
|
||
" \"QuestionScore\": 5,\n",
|
||
" \"QuestionTags\": \"social, artificial consciousness\"\n",
|
||
" }\n",
|
||
" },\n",
|
||
" \"metadata\": {\n",
|
||
" \"Title\": \"\\\"AI will kill us all! The machines will rise up!\\\" - what is being done to dispel such myths?\",\n",
|
||
" \"QuestionContentLicense\": \"CC BY-SA 4.0\",\n",
|
||
" \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n",
|
||
" \"CreationDate\": \"2019-10-16T13:57:37.143\"\n",
|
||
" }\n",
|
||
"}\n"
|
||
]
|
||
}
|
||
]
|
||
}
|
||
]
|
||
}
|