Cleanup notebooks folder (#716)

- standardize file naming to be a little more pythonic.
- one folder per notebook plus a README for easy GH viewing.
- adjust colab badges for new structure.
This commit is contained in:
Andrew Maguire
2023-01-14 22:51:09 +00:00
committed by GitHub
parent 0f607c7a28
commit 86cadc6e9f
16 changed files with 72 additions and 450 deletions
@@ -1,21 +0,0 @@
# Generate Topics, Questions, and Answers from a text
This Python code generates topics, questions, and answers from a paragraph of
text. This is a good way to generate ground-truth knowledge about a topic from
a trusted source.
The output of this is a dictionary with:
1. submitted paragraph
1. generated topics
1. generated questions
1. generated topic prefixes that can be prepended to the questions
1. open book answer based only on the provided paragraph
1. closed book answers generated by FLAN-T5-11B
## Contributing
This code is verified to work on a graphics card with 24GB of VRAM (such as an
RTX 3090). We are working on getting it to run on Google Colab TPUs. It may
also be possible to use smaller T5 models, such as the 3-billion-parameter
model, and still get acceptable results.
@@ -1,406 +0,0 @@
# This notebook will run on a system with a single RTX3090 (24 GB vram).
# You need to install accelerate, bitsandbytes, and transformers
import math
import pickle
import re
import time

import torch

# load all needed libraries
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Device map that places every listed model submodule on GPU 0.
# Per the original note, this is meant for a GPU with > 24GB of VRAM and
# uses nearly all of that memory.
def _build_single_gpu_device_map():
    """Return a submodule-name -> device-index map with everything on 0.

    Key order: shared/embedding entries, then the encoder stack, then the
    decoder stack, then the LM head.
    """
    placement = {
        "shared": 0,
        "decoder.embed_tokens": 0,
        "encoder.embed_tokens": 0,
    }
    for stack in ("encoder", "decoder"):
        # Each stack lists 24 blocks, indexed 0..23.
        for index in range(24):
            placement[f"{stack}.block.{index}"] = 0
        placement[f"{stack}.final_layer_norm"] = 0
        placement[f"{stack}.dropout"] = 0
    placement["lm_head"] = 0
    return placement


device_map_T5_13B = _build_single_gpu_device_map()
# Load the tokenizer and the model weights in bfloat16 (8-bit loading is
# turned off). The original note: use bfloat16 when doing inference with
# 16-bit precision.
_checkpoint = "flan-t5-xxl"
tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
_model_options = dict(
    device_map=device_map_T5_13B,
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(_checkpoint, **_model_options)
# Read the knowledge-source strings used for QA generation from a pickle.
# NOTE(review): unpickling can execute arbitrary code; only load a
# paragraphs.pkl that comes from a trusted source.
objects = []
with open("paragraphs.pkl", "rb") as openfile:
    # The file may hold several pickled objects back to back; keep
    # reading until the stream is exhausted.
    while True:
        try:
            loaded = pickle.load(openfile)
        except EOFError:
            break
        objects.append(loaded)
# Only the first pickled object is used as the paragraph collection.
paragraphs = objects[0]

# Keep only paragraphs of at most 1100 characters. Per the original note
# this guards against exceeding T5's 512-token context length.
fixed_paragraphs = [text for text in paragraphs if len(text) <= 1100]
print("Original number of paragraphs:", len(paragraphs))
print("Length filtered number of paragraphs:", len(fixed_paragraphs))
paragraphs = fixed_paragraphs
# Sort_Tuple orders a list of tuples in place, largest second element
# first, and returns that same list.
def Sort_Tuple(tup):
    """Sort ``tup`` in place by each item's second element, descending.

    Returns:
        The same list object that was passed in.
    """

    def second_field(item):
        return item[1]

    tup.sort(key=second_field, reverse=True)
    return tup
# Shared implementation behind ask_flan_T5 and ask_flan_T5D: generate text
# for a prompt and score the generated sequence.
def _generate_and_score(input_text, **sampling_kwargs):
    """Run the model on ``input_text`` and score what it generated.

    Args:
        input_text: Prompt string; it is encoded with the module-level
            ``tokenizer`` and moved to CUDA device 0.
        **sampling_kwargs: Extra keyword arguments forwarded to
            ``model.generate`` (for example ``do_sample`` or ``top_p``).

    Returns:
        A ``(text, score)`` tuple. ``text`` is the decoded first returned
        sequence with special tokens removed. ``score`` is the sum of the
        natural logs of the per-token probabilities, rounded to two
        decimals.
    """
    inputs = tokenizer.encode(input_text, return_tensors="pt").cuda(0)
    outputs = model.generate(
        inputs,
        eos_token_id=1,
        max_new_tokens=50,
        bos_token_id=0,
        return_dict_in_generate=True,
        output_scores=True,
        **sampling_kwargs,
    )
    # A single prompt is submitted, so only the first returned sequence is
    # decoded and scored.
    sequence = outputs.sequences[0]
    out_text = tokenizer.decode(sequence, skip_special_tokens=True)
    # Stack the per-step scores and turn them into probabilities.
    probs = torch.stack(outputs.scores, dim=1).softmax(-1)
    logprobs = 0
    # The first token of the sequence is skipped when scoring; presumably
    # it is the decoder start token, which has no generation score.
    for step, token in enumerate(sequence[1:]):
        # Each probability is rounded to two decimals and offset by 0.001,
        # presumably so a probability that rounds to 0 never reaches log().
        word_prob = round(probs[0][step][token.item()].item(), 2) + 0.001
        logprobs = logprobs + math.log(word_prob)
    return (out_text, round(logprobs, 2))


# ask_flan_T5 takes a text input and returns the sampled response of
# FLAN_T5 together with a summed log-probability score for the generation.
def ask_flan_T5(input_text):
    """Sample a response for ``input_text`` (top_p=0.95, temperature=0.9).

    Returns:
        A ``(text, score)`` tuple as described in ``_generate_and_score``.
    """
    return _generate_and_score(
        input_text,
        do_sample=True,
        top_p=0.95,
        temperature=0.9,
    )


# ask_flan_T5D takes a text input and returns the deterministic
# (do_sample=False) output of FLAN_T5 with the same kind of score.
def ask_flan_T5D(input_text):
    """Generate a response for ``input_text`` without sampling.

    Returns:
        A ``(text, score)`` tuple as described in ``_generate_and_score``.
    """
    return _generate_and_score(input_text, do_sample=False)
# Propose topic labels for a paragraph by sampling the model repeatedly.
def generate_topic(paragraph):
    """Return up to five ``(topic, score)`` tuples for ``paragraph``.

    The model is sampled 20 times and only generations scoring above -4
    are kept (duplicates collapse because a set is used). If fewer than
    three results survive, two generic fallback entries are added. The
    result is ordered by score, highest first.
    """
    prompt = (
        "Task: Create a topic classifier for the provided "
        "paragraph.\nParagraph:\n" + paragraph + "\nTopic: "
    )
    candidates = set()
    for _ in range(20):
        sampled = ask_flan_T5(prompt)
        if sampled[1] > -4:
            candidates.add(sampled)
    if len(candidates) < 3:
        # Too few confident topics: pad with neutral conversation openers.
        candidates.add(("I was wondering", -3.3))
        candidates.add(("I have a question", -3.3))
    ranked = Sort_Tuple(list(candidates))
    return ranked[:5]
# Turn each topic into an introductory prepositional phrase
# (e.g. "On the topic of the United States Air Force,").
def generate_topic_prefix(topic_set):
    """Return up to five ``(phrase, score)`` tuples built from ``topic_set``.

    Args:
        topic_set: Iterable of entries whose first element is the topic
            text (the shape returned by ``generate_topic``).

    Each topic is sampled five times with a few-shot prompt; the distinct
    results are ordered by score, highest first, and the top five returned.
    """
    few_shot = (
        "Task: Create a prepositional phrase about the topic.\n"
        "Example 1\n Topic: climbing mount everest\nPrepositional "
        "Phrase: With regards to climbing mount everest,\nExample "
        "2\nTopic: United States Air Force\nPrepositional Phrase: "
        "On the topic of the United States Air Force,\n Example 3\nTopic: "
    )
    phrases = set()
    for entry in topic_set:
        prompt = few_shot + entry[0] + "\nPrepositional Phrase: "
        for _ in range(5):
            phrases.add(ask_flan_T5(prompt))
    ranked = Sort_Tuple(list(phrases))
    return ranked[:5]
# Generate What/Where/Why/How/Who questions from a paragraph.
# number_of_questions is an integer: how many generations to attempt for
# each question type.
def generate_questions(paragraph, number_of_questions):
    """Return a set of ``(qtype, (question_text, score))`` tuples.

    A warning is printed when the tokenized paragraph exceeds 480 tokens,
    but generation still proceeds. A generated question is kept only if
    its text contains the requested question word.
    """
    if len(tokenizer.encode(paragraph)) > 480:
        print("Warning, the context length is too long.")
    collected = set()
    for qtype in ("What", "Where", "Why", "How", "Who"):
        prompt = (
            "Please generate a question that starts with '" + qtype
            + "' based on the following paragraph.\nText:\n" + paragraph
            + "\nQuestion:\n"
        )
        for _ in range(number_of_questions):
            candidate = ask_flan_T5(prompt)
            if qtype not in candidate[0]:
                continue
            collected.add((qtype, candidate))
    return collected
# Generate answers for a set of questions.
# Input is the paragraph of text and a set of questions where each question
# is a tuple generated from the generate_questions() function.
def generate_answers(paragraph, question_set):
    """Answer each question using only the paragraph (open book).

    Args:
        paragraph: Source text the answers must come from.
        question_set: Iterable of ``(qtype, (question_text, score))``
            tuples, as produced by ``generate_questions``.

    Returns:
        A set of ``(qtype, (question_text, score), (answer_text, score))``
        tuples. Questions the model answers with the ``NA`` sentinel are
        left out.
    """
    possible_answers = set()
    for question in question_set:
        input_text = (
            "Please read the following paragraph and "
            "then answer the question using only data "
            "found in the text. If no answer is possible, respond "
            "'NA'.\nText:\n"
            + paragraph
            + "\nQuestion:\n"
            + question[1][0]
            + "\nAnswer:\n"
        )
        answer = ask_flan_T5D(input_text)
        # Match the sentinel only as a standalone word. A plain substring
        # test would also throw away real answers such as "NASA" or "DNA".
        if re.search(r"\bNA\b", answer[0]):
            continue
        possible_answers.add((question[0], question[1], answer))
    return possible_answers
# Generate a second round of questions, this time from the answers.
# Input is the paragraph of text and a set of items produced by the
# generate_answers() function.
def generate_question2(paragraph, qa_set):
    """Ask the model for a question whose answer is each known answer.

    Args:
        paragraph: Source text shown to the model.
        qa_set: Iterable of items whose third element is an
            ``(answer_text, score)`` tuple.

    Returns:
        A set of 4-tuples: the first three elements of each input item
        followed by the ``(question_text, score)`` result.
    """
    lead_in = (
        "Please read the following paragraph and "
        "then generate a question whose answer is: "
    )
    qaq_results = set()
    for item in qa_set:
        prompt = (
            lead_in + item[2][0]
            + "\nParagraph:\n" + paragraph
            + "\nQuestion:\n"
        )
        regenerated = ask_flan_T5D(prompt)
        qaq_results.add((item[0], item[1], item[2], regenerated))
    return qaq_results
# Answer a set of plain question strings from the paragraph.
# Input is the paragraph of text and a set of question strings (the
# question texts produced by generate_question2()).
def generate_answers2(paragraph, question_set):
    """Return a set of ``(question, (answer_text, score))`` tuples.

    Unlike ``generate_answers``, nothing is filtered here: answers that
    contain the ``NA`` sentinel are returned as well.
    """
    instruction = (
        "Please read the following paragraph and "
        "then answer the question using only data "
        "found in the text. If no answer is possible, respond "
        "'NA'.\nText:\n"
    )

    def build_prompt(question):
        return (
            instruction + paragraph
            + "\nQuestion:\n" + question
            + "\nAnswer:\n"
        )

    answered = set()
    for question in question_set:
        reply = ask_flan_T5D(build_prompt(question))
        answered.add((question, reply))
    return answered
# Generate declarative statement from question and answer pair.
def generate_declarative(qaq_set):
    """Rewrite each question/answer pair as a declarative sentence.

    Args:
        qaq_set: Iterable of ``(question, (answer_text, score))`` tuples,
            as produced by ``generate_answers2``.

    Returns:
        A set of ``(question, answer_text, (statement_text, score))``
        tuples. Pairs whose answer is the ``NA`` sentinel are skipped.
    """
    qaqd_results = set()
    for qa_item in qaq_set:
        question = qa_item[0]
        answer = qa_item[1][0]
        # Match the sentinel only as a standalone word. A plain substring
        # test would also discard real answers such as "NASA" or "DNA".
        if re.search(r"\bNA\b", answer):
            continue
        input_text = (
            "Generate a declarative statement based on the "
            "given question and answer pair.\nQ: What is "
            "sitting on the couch?\nA: poodle\nA poodle is "
            "sitting on the couch.\nQ: "
            + question
            + "\nA: "
            + answer
            + "\n"
        )
        result = ask_flan_T5D(input_text)
        qaqd_results.add((question, answer, result))
    return qaqd_results
# Generate closed book answer to question.
def generate_closed_answer(qaqd_set):
    """Answer each question without showing the paragraph (closed book).

    Args:
        qaqd_set: Iterable of ``(question, answer_text,
            (statement_text, score))`` tuples, as produced by
            ``generate_declarative``.

    Returns:
        A set of 4-tuples: the three input elements followed by the
        ``(closed_book_answer_text, score)`` result. Items whose
        declarative statement contains the ``NA`` sentinel are skipped.
    """
    qaqd_results = set()
    for qa_item in qaqd_set:
        question = qa_item[0]
        # The text checked for the sentinel is the declarative statement.
        answer = qa_item[2][0]
        # Match the sentinel only as a standalone word. A plain substring
        # test would also discard statements mentioning "NASA" or "DNA".
        if re.search(r"\bNA\b", answer):
            continue
        input_text = (
            "Task: Answer the question in a detailed fashion. "
            "If the question cannot be answered without more "
            "information, please answer NA.\nExample 1:\nQuestion: "
            "Why does Shala like cookies?\nAnswer: It is not possible "
            "to know why Shala likes cookies without more information, "
            "but many people that like cookies enjoy their taste or "
            "some of their ingredients (e.g. chocolate chips or "
            "peanut butter).\nExample 2:\nQuestion: Why would someone "
            "vote in an election?\nAnswer: There are many reasons "
            "someone might vote in an election, for instance to have "
            "their voice heard or to help a candidate they like win the "
            "race.\nExample 3\nQuestion: What decoration goes on top of "
            "a Christmas tree?\nAnswer: Usually a star is placed at the "
            "top of a Christmas tree.\nExample 4:\nQuestion: "
            + question
            + "\nAnswer: "
        )
        result = ask_flan_T5D(input_text)
        qaqd_results.add((qa_item[0], qa_item[1], qa_item[2], result))
    return qaqd_results
# Create a dictionary of questions and answers from a list of paragraphs.
# Original author's estimate: about 20 seconds per paragraph.
start_time = time.perf_counter()
questions_dict = {}
uniq_id = 100000
# Only the first 1500 paragraphs are processed.
for paragraph in paragraphs[0:1500]:
    topic_list = generate_topic(paragraph)
    topic_prefix = generate_topic_prefix(topic_list)
    question_set = generate_questions(paragraph, 2)
    qa_set = generate_answers(paragraph, question_set)
    qaq_set = generate_question2(paragraph, qa_set)
    # Keep just the text of each regenerated question.
    q2_set = {q[3][0] for q in qaq_set}
    q2a2_set = generate_answers2(paragraph, q2_set)
    a2d_set = generate_declarative(q2a2_set)
    a3cb_set = generate_closed_answer(a2d_set)
    # One numbered entry per surviving question: the question, the
    # paragraph-based statement ("ob") and the closed-book answer ("cb").
    entry_dict = {
        entry_count: {
            "question": entry[0],
            "answer_T5_ob": entry[2][0],
            "answer_T5_cb": entry[3][0],
        }
        for entry_count, entry in enumerate(a3cb_set)
    }
    questions_dict[uniq_id] = {
        "topics": topic_list,
        "topic prepositions": topic_prefix,
        "paragraph": paragraph,
        "QA_set": entry_dict,
    }
    uniq_id += 1
    print(uniq_id, "topics:", topic_prefix)
stop_time = time.perf_counter()
generation_time = stop_time - start_time
# Show the last record only if at least one paragraph was processed;
# otherwise the lookup would raise KeyError.
if questions_dict:
    print(questions_dict[uniq_id - 1])
print(generation_time)
# create a binary pickle file to save your dictionary; the context manager
# closes the file even if pickling fails
with open("questions_dict.pkl", "wb") as f:
    pickle.dump(questions_dict, f)
+5
View File
@@ -0,0 +1,5 @@
# Data Augmentation
This folder contains subfolders of notebooks broadly relating to data
augmentation. Each subfolder contains a README.md file explaining what the
notebooks in that folder do.
@@ -5,7 +5,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/EssayInstructions.ipynb)"
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/essay-instructions/essay-instructions.ipynb)"
]
},
{
@@ -210,7 +210,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -224,11 +224,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.7.4 (tags/v3.7.4:e09359112e, Jul 8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
"hash": "25d5c2324055587ceaeef27650c79ce8358ea61d7689f2e0b8ada5d53f85bce4"
}
}
},
@@ -5,16 +5,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/EssayRevision.ipynb)"
"# Essay Revision"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/essay-revision/essay-revision.ipynb)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "o0lAqmWhsiUe"
},
"source": [
"#Essay Revision\n",
"The goal of this notebook is to use data augmentation to generate data on improving essays. This is done by taking a template \"good\" essay, making step-by-step changes that make it worse, and adding instructions on how to fix it."
]
},
@@ -319,11 +327,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.7.4 (tags/v3.7.4:e09359112e, Jul 8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]"
},
"vscode": {
"interpreter": {
"hash": "492d89208e1af30f4727fd53e254ea56e6b1a843b376782bfa5f6ce13d676265"
"hash": "25d5c2324055587ceaeef27650c79ce8358ea61d7689f2e0b8ada5d53f85bce4"
}
}
},
@@ -5,16 +5,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-argumentation/StackExchangeBuilder.ipynb)"
"# Ingest StackExchange data dumps"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/stackexchange-builder/stackexchange-builder.ipynb)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "TB7CEfs8F-8u"
},
"source": [
"# Ingest StackExchange data dumps\n",
"This notebook takes a StackExchange Data dump \"Posts.xml\" file and ingests it into a Pandas Dataframe. Outputs of the file can be JSON, JSONL, Parquet, or CSV. "
]
},
@@ -1842,10 +1850,17 @@
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"name": "python",
"version": "3.7.4 (tags/v3.7.4:e09359112e, Jul 8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]"
},
"vscode": {
"interpreter": {
"hash": "25d5c2324055587ceaeef27650c79ce8358ea61d7689f2e0b8ada5d53f85bce4"
}
}
},
"nbformat": 4,
@@ -9,11 +9,12 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "b2e3c95c",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/data-argumentation/UnifiedQA.ipynb)"
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/unified-qa/unified-qa.ipynb)"
]
},
{
@@ -493,7 +494,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -507,7 +508,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.7.4 (tags/v3.7.4:e09359112e, Jul 8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]"
},
"vscode": {
"interpreter": {
"hash": "25d5c2324055587ceaeef27650c79ce8358ea61d7689f2e0b8ada5d53f85bce4"
}
}
},
"nbformat": 4,
@@ -5,7 +5,15 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/detoxify-evaluation/DetoxityEvaluation.ipynb)"
"# Detoxify evaluation"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/detoxify-evaluation/detoxify-evaluation.ipynb)"
]
},
{
@@ -23,7 +31,6 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Detoxify evaluation\n",
"[Detoxify](https://github.com/unitaryai/detoxify) is an open-source model used to identify toxic prompts\n",
"\n",
"<img src=\"https://raw.githubusercontent.com/unitaryai/detoxify/master/examples.png\" alt=\"Image from detoxify github that shows the example input/output of their model\" />\n",
@@ -472,7 +479,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "DetoxifyEvaluation",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -486,12 +493,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.7.4 (tags/v3.7.4:e09359112e, Jul 8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "aeda4fe49bddd52f429be231bf767df53f2b167abae0a465a8ef142aa6b97b8a"
"hash": "25d5c2324055587ceaeef27650c79ce8358ea61d7689f2e0b8ada5d53f85bce4"
}
}
},
@@ -1,4 +1,4 @@
# OpenBuggerNotebook
# OpenBugger
https://github.com/furlat/OpenBugger/blob/main/README.md is a Python package
that allows you to inject syntax and logic errors into your code. This can be
@@ -5,7 +5,15 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/code-bugger/openbugger_example.ipynb)"
"# OpenBugger Example"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/openbugger/openbugger_example.ipynb)"
]
},
{
@@ -272,12 +280,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6 (tags/v3.10.6:9c7b4bd, Aug 1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]"
"version": "3.7.4 (tags/v3.7.4:e09359112e, Jul 8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "ceba285e8b4e6478fe8ad229bc63940a90ad5cf3d143521e7c38823a2e915b21"
"hash": "25d5c2324055587ceaeef27650c79ce8358ea61d7689f2e0b8ada5d53f85bce4"
}
}
},