diff --git a/notebooks/data-argumentation/StackExchangeBuilder.ipynb b/notebooks/data-argumentation/StackExchangeBuilder.ipynb new file mode 100644 index 00000000..625d757b --- /dev/null +++ b/notebooks/data-argumentation/StackExchangeBuilder.ipynb @@ -0,0 +1,1845 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Ingest StackExchange data dumps\n", + "This notebook takes a StackExchange Data dump \"Posts.xml\" file and ingests it into a Pandas Dataframe. Outputs of the file can be JSON, JSONL, Parquet, or CSV. " + ], + "metadata": { + "id": "TB7CEfs8F-8u" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0rHryQttyzyY" + }, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup as bs\n", + "import pandas as pd\n", + "import requests\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Extract StackExchange\n", + "Pull StackExchange file dumps. 
# URL template for reading Posts.xml straight out of a 7z archive hosted on
# archive.org; "{0}" is replaced by the archive file name taken from the listing.
base_url = "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml"


def get_all_filenames(timeout: float = 60.0):
    """
    Build a lookup of StackExchange dump archives available on archive.org

    Downloads the archive.org "stackexchange" listing page, collects every
    link inside its first table that ends in "7z" and maps a short dataset
    name to the URL of the Posts.xml file inside that archive.

    The dataset name is the part of the file name before ".stackexchange",
    with "." and "-" replaced by "_" (e.g. "ai.stackexchange.com.7z" -> "ai").
    File names that do not contain ".stackexchange" keep their whole name,
    e.g. "askubuntu.com.7z" -> "askubuntu_com_7z".

    Parameters:
        timeout (float): seconds to wait for archive.org before giving up

    Returns:
        urls (dict): dataset name -> Posts.xml URL

    Raises:
        requests.HTTPError: the listing page answered with an error status
        RuntimeError: the listing page does not contain a table of files
    """
    response = requests.get("https://archive.org/download/stackexchange", timeout=timeout)
    # Fail loudly here instead of returning nothing and crashing in the caller.
    response.raise_for_status()

    soup = bs(response.content, "html.parser")
    table = soup.find("table")
    if table is None:
        raise RuntimeError("No file table found on the archive.org stackexchange listing page")

    urls = {}
    # href=True skips anchors without a target, which would raise KeyError below.
    for link in table.find_all("a", href=True):
        url = link["href"]
        if not url.endswith("7z"):
            continue
        name = url.split(".stackexchange")[0].replace(".", "_").replace("-", "_")
        urls[name] = base_url.format(url)
    return urls
'civicrm_meta', 'civicrm', 'codegolf_meta', 'codegolf', 'codereview_meta', 'codereview', 'coffee_meta', 'coffee', 'cogsci_meta', 'cogsci', 'computergraphics_meta', 'computergraphics', 'conlang_meta', 'conlang', 'cooking_meta', 'cooking', 'craftcms_meta', 'craftcms', 'crafts_meta', 'crafts', 'crypto_meta', 'crypto', 'cs_meta', 'cs', 'cseducators_meta', 'cseducators', 'cstheory_meta', 'cstheory', 'datascience_meta', 'datascience', 'dba_meta', 'dba', 'devops_meta', 'devops', 'diy_meta', 'diy', 'drones_meta', 'drones', 'drupal_meta', 'drupal', 'dsp_meta', 'dsp', 'earthscience_meta', 'earthscience', 'ebooks_meta', 'ebooks', 'economics_meta', 'economics', 'electronics_meta', 'electronics', 'elementaryos_meta', 'elementaryos', 'ell_meta', 'ell', 'emacs_meta', 'emacs', 'engineering_meta', 'engineering', 'english_meta', 'english', 'eosio_meta', 'eosio', 'es_meta_stackoverflow_com_7z', 'es_stackoverflow_com_7z', 'esperanto_meta', 'esperanto', 'ethereum_meta', 'ethereum', 'expatriates_meta', 'expatriates', 'expressionengine_meta', 'expressionengine', 'fitness_meta', 'fitness', 'freelancing_meta', 'freelancing', 'french_meta', 'french', 'gamedev_meta', 'gamedev', 'gaming_meta', 'gaming', 'gardening_meta', 'gardening', 'genealogy_meta', 'genealogy', 'german_meta', 'german', 'gis_meta', 'gis', 'graphicdesign_meta', 'graphicdesign', 'ham_meta', 'ham', 'hardwarerecs_meta', 'hardwarerecs', 'health_meta', 'health', 'hermeneutics_meta', 'hermeneutics', 'hinduism_meta', 'hinduism', 'history_meta', 'history', 'homebrew_meta', 'homebrew', 'hsm_meta', 'hsm', 'interpersonal_meta', 'interpersonal', 'iot_meta', 'iot', 'iota_meta', 'iota', 'islam_meta', 'islam', 'italian_meta', 'italian', 'ja_meta_stackoverflow_com_7z', 'ja_stackoverflow_com_7z', 'japanese_meta', 'japanese', 'joomla_meta', 'joomla', 'judaism_meta', 'judaism', 'korean_meta', 'korean', 'languagelearning_meta', 'languagelearning', 'latin_meta', 'latin', 'law_meta', 'law', 'lifehacks_meta', 'lifehacks', 'linguistics_meta', 
'linguistics', 'literature_meta', 'literature', 'magento_meta', 'magento', 'martialarts_meta', 'martialarts', 'materials_meta', 'materials', 'math_meta', 'math', 'matheducators_meta', 'matheducators', 'mathematica_meta', 'mathematica', 'mathoverflow_net_7z', 'mechanics_meta', 'mechanics', 'meta_askubuntu_com_7z', 'meta_mathoverflow_net_7z', 'meta_serverfault_com_7z', 'meta', 'meta_stackoverflow_com_7z', 'meta_superuser_com_7z', 'moderators_meta', 'moderators', 'monero_meta', 'monero', 'money_meta', 'money', 'movies_meta', 'movies', 'music_meta', 'music', 'musicfans_meta', 'musicfans', 'mythology_meta', 'mythology', 'networkengineering_meta', 'networkengineering', 'opendata_meta', 'opendata', 'opensource_meta', 'opensource', 'or_meta', 'or', 'outdoors_meta', 'outdoors', 'parenting_meta', 'parenting', 'patents_meta', 'patents', 'pets_meta', 'pets', 'philosophy_meta', 'philosophy', 'photo_meta', 'photo', 'physics_meta', 'physics', 'pm_meta', 'pm', 'poker_meta', 'poker', 'politics_meta', 'politics', 'portuguese_meta', 'portuguese', 'proofassistants_meta', 'proofassistants', 'pt_meta_stackoverflow_com_7z', 'pt_stackoverflow_com_7z', 'puzzling_meta', 'puzzling', 'quant_meta', 'quant', 'quantumcomputing_meta', 'quantumcomputing', 'raspberrypi_meta', 'raspberrypi', 'retrocomputing_meta', 'retrocomputing', 'reverseengineering_meta', 'reverseengineering', 'robotics_meta', 'robotics', 'rpg_meta', 'rpg', 'ru_meta_stackoverflow_com_7z', 'ru_stackoverflow_com_7z', 'rus_meta', 'rus', 'russian_meta', 'russian', 'salesforce_meta', 'salesforce', 'scicomp_meta', 'scicomp', 'scifi_meta', 'scifi', 'security_meta', 'security', 'serverfault_com_7z', 'sharepoint_meta', 'sharepoint', 'sitecore_meta', 'sitecore', 'skeptics_meta', 'skeptics', 'softwareengineering_meta', 'softwareengineering', 'softwarerecs_meta', 'softwarerecs', 'solana_meta', 'solana', 'sound_meta', 'sound', 'space_meta', 'space', 'spanish_meta', 'spanish', 'sports_meta', 'sports', 'sqa_meta', 'sqa', 'stackapps_com_7z', 
# Column -> dtype enforced on the parsed Posts.xml attributes so that later
# cells can compare ids and scores numerically.
xml_format_map = {
    "Id": int,
    "PostTypeId": int,
    "CreationDate": str,
    "Score": int,
    "ViewCount": int,
    "Body": str,
    "AnswerCount": int,
    "CommentCount": int,
    "ContentLicense": str,
    "AcceptedAnswerId": int,
    "ParentId": int,
}


def xml_to_df(response: "requests.Response"):
    """
    Collect and Manually import XML into Dataframe

    pd.read_xml() errors when XML trees are too large, this is just a hack to
    download a XML file and parse into a Dataframe. **Not Tested on huge XML files**

    Every "row" element becomes one DataFrame row and every XML attribute one
    column (all values start out as strings). AnswerCount, ViewCount,
    AcceptedAnswerId and ParentId are not present on every row; missing values
    are replaced by 0 so the columns can be cast to int. A column that is
    absent from the whole file is created and filled with 0.

    Parameters:
        response (Requests.Response): Requests response object with the XML data

    Returns:
        df (DataFrame): A Dataframe from the XML file, with a "DataSource"
            column holding the URL the data was fetched from

    Raises:
        requests.HTTPError: the response carries an HTTP error status
    """
    # An error page would otherwise parse to zero rows and fail obscurely below.
    response.raise_for_status()

    soup = bs(response.content, "xml")
    all_posts = [post.attrs for post in soup.find_all("row")]

    df = pd.DataFrame(all_posts)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # an attribute accessor: the chained in-place form is not guaranteed to
    # write through to df (it is a no-op under pandas Copy-on-Write).
    for optional_column in ("AnswerCount", "ViewCount", "AcceptedAnswerId", "ParentId"):
        if optional_column in df.columns:
            df[optional_column] = df[optional_column].fillna(0)
        else:
            df[optional_column] = 0

    df["DataSource"] = response.url

    # Only cast the columns this particular dump actually provides.
    dtypes = {column: dtype for column, dtype in xml_format_map.items() if column in df.columns}
    return df.astype(dtypes)
AcceptedAnswerId CreationDate Score \\\n", + "0 1 1 3 2016-08-02T15:39:14.947 10 \n", + "1 2 1 9 2016-08-02T15:40:20.623 14 \n", + "2 3 2 0 2016-08-02T15:40:24.820 15 \n", + "3 4 1 12 2016-08-02T15:41:22.020 33 \n", + "4 6 1 20 2016-08-02T15:43:35.460 7 \n", + "\n", + " ViewCount Body OwnerUserId \\\n", + "0 710

What does \"backprop\" mean? Is the \"backprop... 8 \n", + "1 1008

Does increasing the noise in data help to i... 8 \n", + "2 0

\"Backprop\" is the same as \"backpropagation\"... 4 \n", + "3 1266

When you're writing your algorithm, how do ... 8 \n", + "4 279

Given the following definition of an intell... 29 \n", + "\n", + " LastEditorUserId LastEditDate ... AnswerCount CommentCount \\\n", + "0 2444 2019-11-16T17:56:22.093 ... 5 0 \n", + "1 2444 2019-02-23T22:36:19.090 ... 3 0 \n", + "2 NaN NaN ... 0 0 \n", + "3 2444 2021-01-19T23:54:07.813 ... 4 0 \n", + "4 2444 2019-06-15T18:25:58.513 ... 2 0 \n", + "\n", + " ContentLicense ParentId ClosedDate FavoriteCount CommunityOwnedDate \\\n", + "0 CC BY-SA 4.0 0 NaN NaN NaN \n", + "1 CC BY-SA 4.0 0 NaN NaN NaN \n", + "2 CC BY-SA 3.0 1 NaN NaN NaN \n", + "3 CC BY-SA 3.0 0 NaN NaN NaN \n", + "4 CC BY-SA 4.0 0 NaN NaN NaN \n", + "\n", + " LastEditorDisplayName OwnerDisplayName \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " DataSource \n", + "0 https://ia600107.us.archive.org/view_archive.p... \n", + "1 https://ia600107.us.archive.org/view_archive.p... \n", + "2 https://ia600107.us.archive.org/view_archive.p... \n", + "3 https://ia600107.us.archive.org/view_archive.p... \n", + "4 https://ia600107.us.archive.org/view_archive.p... \n", + "\n", + "[5 rows x 23 columns]" + ], + "text/html": [ + "\n", + "

\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreationDateScoreViewCountBodyOwnerUserIdLastEditorUserIdLastEditDate...AnswerCountCommentCountContentLicenseParentIdClosedDateFavoriteCountCommunityOwnedDateLastEditorDisplayNameOwnerDisplayNameDataSource
01132016-08-02T15:39:14.94710710<p>What does \"backprop\" mean? Is the \"backprop...824442019-11-16T17:56:22.093...50CC BY-SA 4.00NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
12192016-08-02T15:40:20.623141008<p>Does increasing the noise in data help to i...824442019-02-23T22:36:19.090...30CC BY-SA 4.00NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
23202016-08-02T15:40:24.820150<p>\"Backprop\" is the same as \"backpropagation\"...4NaNNaN...00CC BY-SA 3.01NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
341122016-08-02T15:41:22.020331266<p>When you're writing your algorithm, how do ...824442021-01-19T23:54:07.813...40CC BY-SA 3.00NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
461202016-08-02T15:43:35.4607279<p>Given the following definition of an intell...2924442019-06-15T18:25:58.513...20CC BY-SA 4.00NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
\n", + "

5 rows × 23 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
def filter_only_questions_with_accepted_answers(df):
    """
    Filter only to Questions with Accepted Answers

    Filter dataframe by questions that have accepted answers, should also include
    all rows of answers for those questions, even if not accepted.

    The input is not modified; assign the result, e.g.
    ``df = filter_only_questions_with_accepted_answers(df)``.

    Parameters:
        df (DataFrame): containing a "AcceptedAnswerId", "Id", and "ParentId" columns

    Returns:
        df (DataFrame): new dataframe with filtered results
    """
    accepted_id = df["AcceptedAnswerId"]
    # A missing accepted answer shows up either as NaN or as the 0 placeholder
    # written when the dump is loaded, so both mean "no accepted answer".
    has_accepted_answer = accepted_id.notnull() & (accepted_id != 0)
    kept_question_ids = df.loc[has_accepted_answer, "Id"]
    # Answers point at their question through ParentId.
    answers_to_kept_questions = df["ParentId"].isin(kept_question_ids)
    # .copy() so later in-place column assignments do not warn about writing to a slice.
    return df[has_accepted_answer | answers_to_kept_questions].copy()


def filter_scores_above(df, question_score_threshold: int = 20, answer_score_threshold: int = 20):
    """
    Filter Dataframe by minimum scores

    Filter Question and Answer columns by score thresholds to trim lower scoring results.
    Rows whose PostTypeId is neither 1 (question) nor 2 (answer) are dropped.

    The input is not modified; assign the result, e.g. ``df = filter_scores_above(df)``.

    Parameters:
        df (DataFrame): containing a "Score" and a "PostTypeId" column
        question_score_threshold (int): minimum Score for rows with PostTypeId 1
        answer_score_threshold (int): minimum Score for rows with PostTypeId 2

    Returns:
        df (DataFrame): new dataframe with filtered results
    """
    good_question = (df["PostTypeId"] == 1) & (df["Score"] >= question_score_threshold)
    good_answer = (df["PostTypeId"] == 2) & (df["Score"] >= answer_score_threshold)
    return df[good_question | good_answer].copy()


def convert_html_to_text(df, column: str = "Body"):
    """
    Convert HTML tags to pure text

    Feeds HTML text body into BeautifulSoup to parse it to only text. Set aside as
    function to provide option to skip

    Works in place: rows where ``column`` is missing are dropped from ``df`` and a
    new "<column>Clean" column is added. The same dataframe is also returned.

    Parameters:
        df (DataFrame): containing a "Body" column with HTML
        column (str): name of the column holding the HTML

    Returns:
        df (DataFrame): current dataframe with parsed column
    """
    # Kept in place on purpose: existing callers ignore the return value.
    df.dropna(subset=[column], inplace=True)
    df[f"{column}Clean"] = df[column].apply(lambda html: bs(html, "html.parser").text)
    return df


def clean_tags(df):
    """
    Convert Tags into Comma separated

    Converts Tag slugs into commas separated tags, e.g.
    "<neural-networks><homework>" becomes "neural networks, homework".
    Rows without tags stay missing. Works in place (adds a "TagsClean"
    column) and also returns the same dataframe.

    Parameters:
        df (DataFrame): containing a "Tags" column with slugs

    Returns:
        df (DataFrame): current dataframe with parsed column
    """
    # regex=False: these are literal characters; do not depend on the
    # pandas-version-specific default of str.replace.
    df["TagsClean"] = (
        df["Tags"]
        .str.replace("-", " ", regex=False)
        .str.replace("><", ", ", regex=False)
        .str.replace("<", "", regex=False)
        .str.replace(">", "", regex=False)
    )
    return df

What does \"backprop\" mean? Is the \"backprop... \n", + "1

Does increasing the noise in data help to i... \n", + "2

\"Backprop\" is the same as \"backpropagation\"... \n", + "3

When you're writing your algorithm, how do ... \n", + "4

Given the following definition of an intell... \n", + "... ... \n", + "23174

The purpose of evaluating the state and act... \n", + "23175

In machine translation, convolution is a te... \n", + "23176

One of the key features of ChatGPT is its a... \n", + "23177

Given a neural network model for Covid-19 c... \n", + "23178

My question is more related to the fundamen... \n", + "\n", + " BodyClean \\\n", + "0 What does \"backprop\" mean? Is the \"backprop\" t... \n", + "1 Does increasing the noise in data help to impr... \n", + "2 \"Backprop\" is the same as \"backpropagation\": i... \n", + "3 When you're writing your algorithm, how do you... \n", + "4 Given the following definition of an intellige... \n", + "... ... \n", + "23174 The purpose of evaluating the state and action... \n", + "23175 In machine translation, convolution is a techn... \n", + "23176 One of the key features of ChatGPT is its abil... \n", + "23177 Given a neural network model for Covid-19 clas... \n", + "23178 My question is more related to the fundamental... \n", + "\n", + " Tags \\\n", + "0 ... \n", + "4 \n", + "... ... \n", + "23174 NaN \n", + "23175 NaN \n", + "23176 NaN \n", + "23177 \n", + "23178 \n", + "\n", + " TagsClean \n", + "0 neural networks, backpropagation, terminology,... \n", + "1 neural networks, machine learning, statistical... \n", + "2 NaN \n", + "3 neural networks, hyperparameter optimization, ... \n", + "4 philosophy, definitions, intelligent agent \n", + "... ... \n", + "23174 NaN \n", + "23175 NaN \n", + "23176 NaN \n", + "23177 neural networks, homework \n", + "23178 search, constraint satisfaction problems \n", + "\n", + "[23179 rows x 4 columns]" + ], + "text/html": [ + "\n", + "

\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BodyBodyCleanTagsTagsClean
0<p>What does \"backprop\" mean? Is the \"backprop...What does \"backprop\" mean? Is the \"backprop\" t...<neural-networks><backpropagation><terminology...neural networks, backpropagation, terminology,...
1<p>Does increasing the noise in data help to i...Does increasing the noise in data help to impr...<neural-networks><machine-learning><statistica...neural networks, machine learning, statistical...
2<p>\"Backprop\" is the same as \"backpropagation\"...\"Backprop\" is the same as \"backpropagation\": i...NaNNaN
3<p>When you're writing your algorithm, how do ...When you're writing your algorithm, how do you...<neural-networks><hyperparameter-optimization>...neural networks, hyperparameter optimization, ...
4<p>Given the following definition of an intell...Given the following definition of an intellige...<philosophy><definitions><intelligent-agent>philosophy, definitions, intelligent agent
...............
23174<p>The purpose of evaluating the state and act...The purpose of evaluating the state and action...NaNNaN
23175<p>In machine translation, convolution is a te...In machine translation, convolution is a techn...NaNNaN
23176<p>One of the key features of ChatGPT is its a...One of the key features of ChatGPT is its abil...NaNNaN
23177<p>Given a neural network model for Covid-19 c...Given a neural network model for Covid-19 clas...<neural-networks><homework>neural networks, homework
23178<p>My question is more related to the fundamen...My question is more related to the fundamental...<search><constraint-satisfaction-problems>search, constraint satisfaction problems
\n", + "

23179 rows × 4 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
# Split posts by type: PostTypeId 1 = question, 2 = answer.
questions = df[df.PostTypeId == 1]
answers = df[df.PostTypeId == 2]

# Answer columns carried into the merged frame; names shared with the
# question side receive the "_a" suffix.
answer_columns = [
    "Id",
    "CreationDate",
    "Score",
    "ViewCount",
    "CommentCount",
    "ContentLicense",
    "TagsClean",
    "BodyClean",
    "ParentId",
]

# Left join keeps questions without any answer (their answer columns are NaN).
df = pd.merge(
    questions,
    answers[answer_columns],
    left_on="Id",
    right_on="ParentId",
    suffixes=("_q", "_a"),
    how="left",
)

# Vectorized comparison instead of a row-wise apply; a NaN Id_a (question
# without answers) compares unequal and therefore yields False.
df["AcceptedAnswerFlag"] = df["Id_a"] == df["AcceptedAnswerId"]

df = df.rename(
    columns={
        "BodyClean_q": "Question",
        "Score_q": "QuestionScore",
        "TagsClean_q": "QuestionTags",
        "BodyClean_a": "Answer",
        "Score_a": "AnswerScore",
        "ContentLicense_q": "QuestionContentLicense",
        "ContentLicense_a": "AnswerContentLicense",
        "CreationDate_q": "CreationDate",
    }
)

## Set the number of results to a lower number to only return top N rated Answers.
number_of_results = 25
# Accepted answer first, then highest score. Group on the question id rather
# than the question text so that two different questions with identical
# bodies are not merged into one group.
df = (
    df.sort_values(by=["AcceptedAnswerFlag", "AnswerScore"], ascending=[False, False])
    .groupby("Id_q")
    .head(number_of_results)
    .reset_index(drop=True)
)

df[["Id_q", "Question", "QuestionScore", "QuestionTags", "Id_a", "Answer", "AnswerScore", "AcceptedAnswerFlag"]].head()
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Id_qQuestionQuestionScoreQuestionTagsId_aAnswerAnswerScoreAcceptedAnswerFlag
01768In Portal 2 we see that AI's can be \"killed\" b...175philosophy, decision theory, mythology of ai, ...1769.0This classic problem exhibits a basic misunder...146.0True
110623What is self-supervised learning in machine le...91machine learning, comparison, supervised learn...10624.0Introduction\\nThe term self-supervised learnin...90.0True
2111Obviously, self-driving cars aren't perfect, s...100philosophy, ethics, autonomous vehicles, decis...1790.0\\nHow could self-driving cars make ethical dec...76.0True
314224If the original purpose for developing AI was ...69philosophy, social, explainable ai14247.0As argued by Selvaraju et al., there are three...75.0True
41479Do scientists or research experts know from th...94neural networks, deep learning, convolutional ...4044.0There are many approaches that aim to make a t...69.0True
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
# Spot check: take the question id that occurs in the most rows (one row per
# question/answer pair) and show all of its answers side by side.
testing_id = df["Id_q"].mode()[0]

inspect_columns = [
    "Id_q",
    "Question",
    "ParentId_a",
    "AcceptedAnswerId",
    "Id_a",
    "Answer",
    "AnswerScore",
    "AcceptedAnswerFlag",
]
belongs_to_question = (df["Id_q"] == testing_id) | (df["ParentId_a"] == testing_id)

df.loc[belongs_to_question, inspect_columns]
15730.0 \n", + "7465 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7466 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7467 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "9481 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "\n", + " AcceptedAnswerId Id_a \\\n", + "7 15744 15744.0 \n", + "3662 15744 15753.0 \n", + "3713 15744 15747.0 \n", + "3788 15744 15756.0 \n", + "3821 15744 15758.0 \n", + "3882 15744 15762.0 \n", + "4389 15744 15783.0 \n", + "4849 15744 15740.0 \n", + "4850 15744 15803.0 \n", + "5763 15744 15768.0 \n", + "5764 15744 15810.0 \n", + "5765 15744 15943.0 \n", + "7462 15744 15779.0 \n", + "7463 15744 15787.0 \n", + "7464 15744 15801.0 \n", + "7465 15744 15930.0 \n", + "7466 15744 15934.0 \n", + "7467 15744 15938.0 \n", + "9481 15744 15931.0 \n", + "\n", + " Answer AnswerScore \\\n", + "7 I think this is a fairly common misconception ... 62.0 \n", + "3662 I think your premise is flawed.\\nYou seem to a... 19.0 \n", + "3713 TL;DR: The subtleties of infinity are made app... 12.0 \n", + "3788 In Haskell, you can type:\\nprint [1..]\\nand it... 9.0 \n", + "3821 I believe humans can be said to understand inf... 8.0 \n", + "3882 (There's a summary at the bottom for those who... 7.0 \n", + "4389 Then premise assumes that humans \"understand\" ... 4.0 \n", + "4849 By adding some rules for infinity in arithmeti... 3.0 \n", + "4850 I think the concept that is missing in the dis... 3.0 \n", + "5763 Computers don't understand \"infinity\" or even ... 2.0 \n", + "5764 The Questions That Computers Can Never Answer ... 2.0 \n", + "5765 John Doucette's answer covers my thoughts on t... 2.0 \n", + "7462 I would think that a computer couldn’t underst... 1.0 \n", + "7463 The \"concept\" of infinity is 1 thing to unders... 1.0 \n", + "7464 Just food for thought: how about if we try to ... 1.0 \n", + "7465 Its arguable if we humans understand infinity.... 
1.0 \n", + "7466 Well -- just to touch on the question of peopl... 1.0 \n", + "7467 Humans certainly don't understand infinity. Cu... 1.0 \n", + "9481 I think the property humans have which compute... 0.0 \n", + "\n", + " AcceptedAnswerFlag \n", + "7 True \n", + "3662 False \n", + "3713 False \n", + "3788 False \n", + "3821 False \n", + "3882 False \n", + "4389 False \n", + "4849 False \n", + "4850 False \n", + "5763 False \n", + "5764 False \n", + "5765 False \n", + "7462 False \n", + "7463 False \n", + "7464 False \n", + "7465 False \n", + "7466 False \n", + "7467 False \n", + "9481 False " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + "
Id_qQuestionParentId_aAcceptedAnswerIdId_aAnswerAnswerScoreAcceptedAnswerFlag
715730As a human being, we can think infinity. In pr...15730.01574415744.0I think this is a fairly common misconception ...62.0True
366215730As a human being, we can think infinity. In pr...15730.01574415753.0I think your premise is flawed.\\nYou seem to a...19.0False
371315730As a human being, we can think infinity. In pr...15730.01574415747.0TL;DR: The subtleties of infinity are made app...12.0False
378815730As a human being, we can think infinity. In pr...15730.01574415756.0In Haskell, you can type:\\nprint [1..]\\nand it...9.0False
382115730As a human being, we can think infinity. In pr...15730.01574415758.0I believe humans can be said to understand inf...8.0False
388215730As a human being, we can think infinity. In pr...15730.01574415762.0(There's a summary at the bottom for those who...7.0False
438915730As a human being, we can think infinity. In pr...15730.01574415783.0Then premise assumes that humans \"understand\" ...4.0False
484915730As a human being, we can think infinity. In pr...15730.01574415740.0By adding some rules for infinity in arithmeti...3.0False
485015730As a human being, we can think infinity. In pr...15730.01574415803.0I think the concept that is missing in the dis...3.0False
576315730As a human being, we can think infinity. In pr...15730.01574415768.0Computers don't understand \"infinity\" or even ...2.0False
576415730As a human being, we can think infinity. In pr...15730.01574415810.0The Questions That Computers Can Never Answer ...2.0False
576515730As a human being, we can think infinity. In pr...15730.01574415943.0John Doucette's answer covers my thoughts on t...2.0False
746215730As a human being, we can think infinity. In pr...15730.01574415779.0I would think that a computer couldn’t underst...1.0False
746315730As a human being, we can think infinity. In pr...15730.01574415787.0The \"concept\" of infinity is 1 thing to unders...1.0False
746415730As a human being, we can think infinity. In pr...15730.01574415801.0Just food for thought: how about if we try to ...1.0False
746515730As a human being, we can think infinity. In pr...15730.01574415930.0Its arguable if we humans understand infinity....1.0False
746615730As a human being, we can think infinity. In pr...15730.01574415934.0Well -- just to touch on the question of peopl...1.0False
746715730As a human being, we can think infinity. In pr...15730.01574415938.0Humans certainly don't understand infinity. Cu...1.0False
948115730As a human being, we can think infinity. In pr...15730.01574415931.0I think the property humans have which compute...0.0False
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
# Columns that identify one question; each distinct combination becomes one record.
question_columns = [
    "Title",
    "Question",
    "QuestionScore",
    "QuestionTags",
    "QuestionContentLicense",
    "DataSource",
    "CreationDate",
]
# Columns copied into every entry of a question's "Answers" list.
answer_columns = ["Answer", "AnswerScore", "AcceptedAnswerFlag"]

# NOTE(review): groupby drops rows whose key columns contain NaN by default -
# confirm none of the question columns can be missing in `df`.
answers_by_question = df.groupby(question_columns).apply(
    lambda group: group[answer_columns].to_dict("records")
)

# The unnamed result column of `apply` is labelled 0 after reset_index().
questions_with_answers = answers_by_question.reset_index().rename(columns={0: "Answers"})

# Serialise to a JSON string and parse it back to obtain plain Python dicts/lists.
j = questions_with_answers.to_json(orient="records")
data = json.loads(j)

# Preview: pretty-print the first question that has at least four answers.
for post in data:
    if len(post.get("Answers")) < 4:
        continue
    print(json.dumps(post, indent=4))
    break
10 hidden layers with 100 neurons each?\\n\",\n", + " \"QuestionScore\": 16,\n", + " \"QuestionTags\": \"neural networks\",\n", + " \"QuestionContentLicense\": \"CC BY-SA 3.0\",\n", + " \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n", + " \"CreationDate\": \"2017-05-04T13:06:37.990\",\n", + " \"Answers\": [\n", + " {\n", + " \"Answer\": \"Basically, having multiple layers (aka a deep network) makes your network more eager to recognize certain aspects of input data. For example, if you have the details of a house (size, lawn size, location etc.) as input and want to predict the price. The first layer may predict:\\n\\nBig area, higher price\\nSmall amount of bedrooms, lower price\\n\\nThe second layer might conclude:\\n\\nBig area + small amount of bedrooms = large bedrooms = +- effect\\n\\nYes, one layer can also 'detect' the stats, however it will require more neurons as it cannot rely on other neurons to do 'parts' of the total calculation required to detect that stat.\\nCheck out this answer\\n\",\n", + " \"AnswerScore\": 13.0,\n", + " \"AcceptedAnswerFlag\": true\n", + " },\n", + " {\n", + " \"Answer\": \"There are so many aspects.\\n1. Training:\\nTraining deep nets is a hard job due to the vanishing (rearly exploding) gradient problem. So building a 10x100 neural-net is not recommended.\\n2. Trained network performance:\\n\\nInformation loss:\\nThe classical usage of neural nets is the classification problem. Which means we want to get some well defined information from the data. (Ex. Is there a face in the picture or not.)\\nSo usually classification problem has a lot of input, and few output, whats more the size of the hidden layers are descend from input to output.\\nHowever, we loss information using less neurons layer by layer. (Ie. We cannot reproduce the original image based on the fact that is there a face on it or no.) 
So you must know that you loss information using 100 neurons if the size of the input is (lets say) 1000.\\nInformation complexity: However the deeper nets (as Tomas W mentioned) can fetch more complex information from the input data. Inspite of this its not recommended to use 10 fully connected layers. Its recommended to use convolutional/relu/maxpooling or other type of layers. Firest layers can compress the some essential part of the inputs. (Ex is there any line in a specific part of the picture) Second layers can say: There is a specific shape in this place in the picture. Etc etc.\\n\\nSo deeper nets are more \\\"clever\\\" but 10x100 net structure is a good choice.\\n\",\n", + " \"AnswerScore\": 4.0,\n", + " \"AcceptedAnswerFlag\": false\n", + " },\n", + " {\n", + " \"Answer\": \"If the problem you are solving is linearly separable, one layer of 1000 neurons can do better job than 10 layers with each of 100 neurons.\\nIf the problem is non linear and not convex, then you need deep neural nets. \\n\",\n", + " \"AnswerScore\": 1.0,\n", + " \"AcceptedAnswerFlag\": false\n", + " },\n", + " {\n", + " \"Answer\": \"\\nI think you have a confusion in the basics of the neural networks.\\n Every layer has a separate activation function and input/output\\n connection weights.\\n\\nThe output of the first hidden layer will be multiplied by a weight, processed by an activation function in the next layer and so on.\\nSingle layer neural networks are very limited for simple tasks, deeper NN can perform far better than a single layer. \\nHowever, do not use more than layer if your application is not fairly complex. In conclusion, 100 neurons layer does not mean better neural network than 10 layers x 10 neurons but 10 layers are something imaginary unless you are doing deep learning. start with 10 neurons in the hidden layer and try to add layers or add more neurons to the same layer to see the difference. 
def save_data(
    data: "list | pd.DataFrame",
    file_name: str,
    file_type: str = "json",
    output_dir: str = "/content",
) -> None:
    """
    Save data to `{output_dir}/{file_name}.{file_type}`.

    JSON and JSONL expect a list of JSON-serialisable items; CSV and Parquet
    expect a Pandas DataFrame. When the data type does not match the requested
    format, or the format is unknown, a message is printed and nothing is
    written (no exception is raised).

    An existing file at the target path is overwritten for every format. This
    includes JSONL, which used to be opened in append mode once per record and
    therefore duplicated all records each time the cell was re-run.

    Parameters:
    data (list | pd.DataFrame): list of dictionaries, or a DataFrame
    file_name (str): name of file (no extension)
    file_type (str): one of "csv", "json", "jsonl", "parquet" (case-insensitive).
        Defaults to "json"; the previous default was a list, which made every
        call that omitted the argument fail on `.lower()`.
    output_dir (str): directory the file is written to (default "/content",
        the location that was previously hard-coded)
    """
    file_type = file_type.lower()

    if file_type not in ("csv", "json", "jsonl", "parquet"):
        print(f"Unsupported file_type {file_type!r}; expected one of: csv, json, jsonl, parquet")
        return

    path = f"{output_dir}/{file_name}.{file_type}"

    if file_type == "csv" and isinstance(data, pd.DataFrame):
        data.to_csv(path, index=False)

    elif file_type == "json" and isinstance(data, list):
        # Context manager closes the handle; the trailing newline matches what
        # print() used to emit.
        with open(path, "w") as json_file:
            json_file.write(json.dumps(data, indent=4) + "\n")

    elif file_type == "jsonl" and isinstance(data, list):
        # Open once for the whole list instead of once per record.
        with open(path, "w") as jsonl_file:
            for item in data:
                jsonl_file.write(json.dumps(item) + "\n")

    elif file_type == "parquet" and isinstance(data, pd.DataFrame):
        data.to_parquet(path, index=False)

    else:
        print("Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet")


# save_data(data=data, file_name=file_name, file_type='jsonl')
# save_data(data=df, file_name=file_name, file_type='parquet')
"9646e475-cedd-46f1-f9b8-7eb1fbc703c7" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Open-Assistant Data Scheme\n", + "\n", + "Testing putting the data into the Open-Assistant Data Scheme\n", + "\n", + "https://github.com/LAION-AI/Open-Assistant/blob/main/docs/data_schemas.md" + ], + "metadata": { + "id": "BdN3hKxtgH7f" + } + }, + { + "cell_type": "code", + "source": [ + "from typing import TypeVar, List, Dict, Any, Literal\n", + "from json import JSONEncoder\n", + "\n", + "T = TypeVar(\"T\", bound=\"ConversationTreeNode\")\n", + "\n", + "\n", + "class ConversationTreeNode:\n", + " text: str # The text of the node\n", + " role: Literal[\"prompter\", \"assistant\"] # Whether the node is a user prompt/follow-up or an assistant response\n", + " children: List[T] # The children of the node (if you have a linear conversation, this will be of length 0 or 1)\n", + " metadata: Dict[str, Any] # Node metadata (see below)\n", + "\n", + " def __init__(\n", + " self, text: str, role: Literal[\"prompter\", \"assistant\"], children: List[T], metadata: Dict[str, Any]\n", + " ) -> None:\n", + " self.text = text\n", + " self.role = role\n", + " self.children = children\n", + " self.metadata = metadata\n", + "\n", + "\n", + "class ConversationTree:\n", + " root: ConversationTreeNode # The node containing the initial prompt\n", + " metadata: Dict[str, Any] # Tree metadata, different from root node metadata.\n", + "\n", + " def __init__(self, root: ConversationTreeNode, metadata: Dict[str, Any]) -> None:\n", + " self.root = root\n", + " self.metadata = metadata\n", + "\n", + "\n", + "# subclass JSONEncoder\n", + "class TreeEncoder(JSONEncoder):\n", + " def default(self, o):\n", + " return o.__dict__" + ], + "metadata": { + "id": "n8ubYQxegNSY" + }, + 
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "conversation_forest = []\n", + "\n", + "tree_metadata_map = {\"Title\": str, \"QuestionContentLicense\": str, \"DataSource\": str, \"CreationDate\": str}\n", + "question_metadata_map = {\"QuestionScore\": int, \"QuestionTags\": str}\n", + "answer_metadata_map = {\"AnswerScore\": int, \"AcceptedAnswerFlag\": bool}\n", + "\n", + "\n", + "for item in data:\n", + " prompt = item.get(\"Question\")\n", + " metadata = {k: v for k, v in item.items() if k in question_metadata_map}\n", + " root = ConversationTreeNode(text=prompt, role=\"prompter\", children=[], metadata=metadata)\n", + "\n", + " for answer in item.get(\"Answers\"):\n", + " response = answer.get(\"Answer\")\n", + " metadata = {k: v for k, v in answer.items() if k in answer_metadata_map}\n", + " child = ConversationTreeNode(text=response, role=\"assistant\", children=[], metadata=metadata)\n", + " root.children.append(child)\n", + "\n", + " metadata = {k: v for k, v in item.items() if k in tree_metadata_map}\n", + " conversation_tree = ConversationTree(root=root, metadata=metadata)\n", + " conversation_forest.append(conversation_tree)\n", + "\n", + "\n", + "conversation_forest_json = [\n", + " json.loads(TreeEncoder().encode(conversation_tree)) for conversation_tree in conversation_forest\n", + "]\n", + "\n", + "\n", + "# print(json.dumps(conversation_forest_json[0], indent=4))\n", + "\n", + "\n", + "print(json.dumps(conversation_forest_json, indent=4), file=open(f\"/content/{file_name}.json\", \"w\"))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eE0fkytExSGl", + "outputId": "594632d6-f98c-49b8-af86-25f7f5e2ce06" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{\n", + " \"root\": {\n", + " \"text\": \"Science Fiction has frequently shown AI to be a threat to the very existence of mankind. 
AI systems have often been the antagonists in many works of fiction, from 2001: A Space Odyssey through to The Terminator and beyond.\\nThe Media seems to buy into this trope as well. And in recent years we have had people like Elon Musk warn us of the dangers of an impending AI revolution, stating that AI is more dangerous than nukes.\\nAnd, apparently, experts think that we will be seeing this AI revolution in the next 100 years.\\nHowever, from my (albeit limited) study of AI, I get the impression that they are all wrong. I am going to outline my understanding below, please correct me if I am wrong:\\n\\nFirstly, all of these things seem to be confusing Artificial Intelligence with Artificial Consciousness. AI is essentially a system to make intelligent decisions, whereas AC is more like the \\\"self-aware\\\" systems that are shown in science fiction.\\n\\nNot AI itself, but intelligence and intelligent decision-making algorithms are something we've been working with and enhancing since before computers have been around. Moving this over to an artificial framework is fairly easy. However, consciousness is still something we are learning about. 
My guess is we won't be able to re-create something artificially if we barely understand how it works in the real world.\\n\\nSo, my conclusion is that no AI system will be able to learn enough to start thinking for itself, and that all our warnings of AI are completely unjustified.\\n\\nThe real danger comes from AC, which we are a long, long way from realizing because we are still a long way off from defining exactly what consciousness is, let alone understanding it.\\n\\n\\n\\nSo, my question is, assuming that my understanding is correct, are any efforts are being made by companies or organizations that work with AI to correct these popular misunderstandings in sci-fi, the media, and/or the public?\\nOr are the proponents of AI ambivalent towards this public fear-mongering?\\nI understand that the fear mongering is going to remain popular for some time, as bad news sells better than good news. I am just wondering if the general attitude from AI organizations is to ignore this popular misconception, or whether a concerted effort is being made to fight against these AI myths (but unfortunately nobody in the media is listening or cares).\\n\",\n", + " \"role\": \"prompter\",\n", + " \"children\": [\n", + " {\n", + " \"text\": \"Nothing. \\nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators makes a perception that the field has greater capabilities than it does $\\\\rightarrow$ this leads to grants, funding, etc... \\nIs there any negative? Yes. Misconceptions always have drawbacks. 
We see the creation of dumb ethics boards and such cough cough Elon Musk.\\nBut if history has anything to say about this, as the field gains popularity (which it is dnagerously quick), information will spread by definition, and eventually misconceptions will be laid to rest.\\nNote that this answer is biased and based upon my own opinions\\n\",\n", + " \"role\": \"assistant\",\n", + " \"children\": [],\n", + " \"metadata\": {\n", + " \"AnswerScore\": 2.0,\n", + " \"AcceptedAnswerFlag\": true\n", + " }\n", + " }\n", + " ],\n", + " \"metadata\": {\n", + " \"QuestionScore\": 5,\n", + " \"QuestionTags\": \"social, artificial consciousness\"\n", + " }\n", + " },\n", + " \"metadata\": {\n", + " \"Title\": \"\\\"AI will kill us all! The machines will rise up!\\\" - what is being done to dispel such myths?\",\n", + " \"QuestionContentLicense\": \"CC BY-SA 4.0\",\n", + " \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n", + " \"CreationDate\": \"2019-10-16T13:57:37.143\"\n", + " }\n", + "}\n" + ] + } + ] + } + ] +} diff --git a/notebooks/data-argumentation/StackExchangeBuilder.md b/notebooks/data-argumentation/StackExchangeBuilder.md new file mode 100644 index 00000000..74a49872 --- /dev/null +++ b/notebooks/data-argumentation/StackExchangeBuilder.md @@ -0,0 +1,106 @@ +# StackExchange Builder + +StackExchange Builder is a notebook that downloads data from StackExchange data +dumps and converts it into different formats. It will parse the XML files, group +questions and answers, can filter the dataset and puts the results into the +Open-Assistant Data Scheme. Files can be saved to either JSON, JSONL, Parquet, +or CSV. 
+ +--- + +#### Sample Data Open-Assistant Data Scheme: + +Open-Assistant Data Scheme as outlined here: +https://github.com/LAION-AI/Open-Assistant/blob/main/docs/data_schemas.md + +``` +{ + "root": { + "text": "Science Fiction has frequently shown AI to be a threat to the very existence of mankind. AI systems have often been the antagonists...", + "role": "prompter", + "children": [ + { + "text": "Nothing. \nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators...", + "role": "assistant", + "children": [], + "metadata": { + "AnswerScore": 2.0, + "AcceptedAnswerFlag": true + } + } + ], + "metadata": { + "QuestionScore": 5, + "QuestionTags": "social, artificial consciousness" + } + }, + "metadata": { + "Title": "\"AI will kill us all! The machines will rise up!\" - what is being done to dispel such myths?", + "QuestionContentLicense": "CC BY-SA 4.0", + "DataSource": "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml", + "CreationDate": "2019-10-16T13:57:37.143" + } +} +``` + +--- + +#### JSONL format + +Each question and all related answers are on a single line in JSONL format. + +``` +{ + "Title": "1 hidden layer with 1000 neurons vs. 10 hidden layers with 100 neurons", + "Question": "These types of questions may be problem-dependent...", + "QuestionScore": 16, + "QuestionTags": "neural networks", + "QuestionContentLicense": "CC BY-SA 3.0", + "DataSource": "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml", + "CreationDate": "2017-05-04T13:06:37.990", + "Answers": [ + { + "Answer": "Basically, having multiple layers (aka a deep network) makes your network more eager to recognize certain aspects of input data...", + "AnswerScore": 13.0, + "AcceptedAnswerFlag": true + }, + { + "Answer": "There are so many aspects.\n1. 
Training:\nTraining deep nets is a hard job due to the vanishing (rearly exploding) gradient problem...", + "AnswerScore": 4.0, + "AcceptedAnswerFlag": false + }, + { + "Answer": "If the problem you are solving is linearly separable, one layer of 1000 neurons can do better job...", + "AnswerScore": 1.0, + "AcceptedAnswerFlag": false + }, + { + "Answer": "\nI think you have a confusion in the basics of the neural networks.\n Every layer has a separate activation...", + "AnswerScore": 0.0, + "AcceptedAnswerFlag": false + } + ] +} +``` + +#### Table/CSV/Parquet Format + +There are a lot more columns left over in the table format. `_q` and `_a` are +suffixes indicating whether the column came from the question or answer table; +they are left over from the join statement. + +``` +| Id_q | Question | ParentId_a | AcceptedAnswerId | Id_a | Answer | AnswerScore | AcceptedAnswerFlag | +|------:|--------------------------------------------------:|-----------:|-----------------:|--------:|--------------------------------------------------:|------------:|-------------------:| +| 15730 | As a human being, we can think infinity. In pr... | 15730.0 | 15744 | 15744.0 | I think this is a fairly common misconception ... | 62.0 | True | +| 15730 | As a human being, we can think infinity. In pr... | 15730.0 | 15744 | 15753.0 | I think your premise is flawed.\nYou seem to a... | 19.0 | False | +| 15730 | As a human being, we can think infinity. In pr... | 15730.0 | 15744 | 15747.0 | TL;DR: The subtleties of infinity are made app... | 12.0 | False | +| 15730 | As a human being, we can think infinity. In pr... | 15730.0 | 15744 | 15756.0 | In Haskell, you can type:\nprint [1..]\nand it... | 9.0 | False | +``` + +--- + +## Contributing + +Feel free to contribute to this notebook. It's not perfect and additional +functionality is planned.