From 34ab948adeefe5e6479ce486c71de4992687d455 Mon Sep 17 00:00:00 2001 From: Bobak Hashemi Date: Sun, 1 Jan 2023 23:30:12 -0500 Subject: [PATCH 001/121] testing rankgen integration into instructor trainer --- .../instructor/configs/rankgen-t5-base.yml | 15 +++ model/reward/instructor/models.py | 22 +++++ model/reward/instructor/rank_datasets.py | 23 +++++ model/reward/instructor/requirements.txt | 3 +- model/reward/instructor/trainer.py | 91 ++++++++++++++----- model/reward/instructor/utils.py | 7 +- 6 files changed, 133 insertions(+), 28 deletions(-) create mode 100644 model/reward/instructor/configs/rankgen-t5-base.yml create mode 100644 model/reward/instructor/models.py diff --git a/model/reward/instructor/configs/rankgen-t5-base.yml b/model/reward/instructor/configs/rankgen-t5-base.yml new file mode 100644 index 00000000..7dd39777 --- /dev/null +++ b/model/reward/instructor/configs/rankgen-t5-base.yml @@ -0,0 +1,15 @@ +model_name: kalpeshk2011/rankgen-t5-base-all +tokenizer_name: google/t5-v1_1-base +learning_rate: 6e-6 +gradient_checkpointing: false +gradient_accumulation_steps: 16 +per_device_train_batch_size: 3 +warmup_steps: 600 +freeze_layer: 20 +eval_steps: 200 +save_steps: 500 +max_length: 400 +num_train_epochs: 2 +datasets: + - webgpt + - hfsummary diff --git a/model/reward/instructor/models.py b/model/reward/instructor/models.py new file mode 100644 index 00000000..699f3566 --- /dev/null +++ b/model/reward/instructor/models.py @@ -0,0 +1,22 @@ +import torch +from transformers import AutoModel + +class RankGenModel(torch.nn.Module): + def __init__(self, model_name): + super().__init__() + self.rankgen_hf_hub = model_name + assert model_name in ["kalpeshk2011/rankgen-t5-xl-all", + "kalpeshk2011/rankgen-t5-xl-pg19", + "kalpeshk2011/rankgen-t5-base-all", + "kalpeshk2011/rankgen-t5-large-all"] + self.model = AutoModel.from_pretrained(self.rankgen_hf_hub, trust_remote_code=True) + + def forward(self, prefixes, suffixes): + embedded_prefixes = 
self.model(**prefixes) + embedded_suffixes = self.model(**suffixes) + # take dot product of each row independently + dot_products = torch.sum(embedded_prefixes * embedded_suffixes, dim=1) + + print(f"{prefixes=}, {suffixes=}, {embedded_prefixes=}, {embedded_suffixes=}, {dot_products=}") + + return dot_products \ No newline at end of file diff --git a/model/reward/instructor/rank_datasets.py b/model/reward/instructor/rank_datasets.py index 99ba9955..3b995a7d 100644 --- a/model/reward/instructor/rank_datasets.py +++ b/model/reward/instructor/rank_datasets.py @@ -24,9 +24,32 @@ from typing import Optional, Union import numpy as np from datasets import load_dataset +import torch from torch.utils.data import Dataset from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase +@dataclass +class RankGenCollator(): + tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + + def __call__(self, batch : list[dict[str, str]]) -> dict[str, torch.Tensor]: + prefixes = [] + better_answers = [] + worse_answers = [] + for question, pairs in batch: + for (pos, neg) in pairs: + prefixes.append("pre " + question) + better_answers.append("suffi " + pos) + worse_answers.append("suffi " + neg) + + tokenized_prefixes = self.tokenizer(prefixes, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True) + tokenized_pos = self.tokenizer(better_answers, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True) + tokenized_neg = self.tokenizer(worse_answers, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True) + return {"prefix" : tokenized_prefixes, + "positive": tokenized_pos, + "negative": tokenized_neg} @dataclass class DataCollatorForPairRank: diff --git a/model/reward/instructor/requirements.txt b/model/reward/instructor/requirements.txt index e225a2ca..eaaf36e6 100644 --- 
a/model/reward/instructor/requirements.txt +++ b/model/reward/instructor/requirements.txt @@ -1,6 +1,7 @@ datasets==2.8.0 evaluate==0.4.0 scikit-learn==1.2.0 -torch==1.12.1+cu116 +torch>=1.12.1 transformers==4.25.1 wandb==0.13.7 +sentencepiece==0.1.97 diff --git a/model/reward/instructor/trainer.py b/model/reward/instructor/trainer.py index 0e98e4c5..5bb1017a 100644 --- a/model/reward/instructor/trainer.py +++ b/model/reward/instructor/trainer.py @@ -7,10 +7,11 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import evaluate import numpy as np import torch -from rank_datasets import DataCollatorForPairRank, HFSummary, WebGPT +from rank_datasets import DataCollatorForPairRank, HFSummary, RankGenCollator, WebGPT from torch import nn from torch.utils.data import ConcatDataset, Dataset from transformers import ( + AutoModel, AutoModelForSequenceClassification, DataCollator, EvalPrediction, @@ -20,6 +21,7 @@ from transformers import ( TrainerCallback, TrainingArguments, ) +from models import RankGenModel from utils import argument_parsing, freeze_top_n_layers, get_tokenizer, train_val_dataset os.environ["WANDB_PROJECT"] = "reward-model" @@ -47,14 +49,17 @@ class RankLoss(nn.Module): self.log_sigmoid = nn.LogSigmoid() def forward(self, pos, neg): - return -self.log_sigmoid(pos - neg + self.eps).mean() + loss = -self.log_sigmoid(pos - neg + self.eps).mean() + print(f"in loss {pos=}, {neg=}, {loss=}") + return loss class RankTrainer(Trainer): def __init__( self, model: Union[PreTrainedModel, nn.Module] = None, - args: TrainingArguments = None, + model_name: str = None, + args: Optional[TrainingArguments] = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Dataset] = None, @@ -80,15 +85,26 @@ class RankTrainer(Trainer): ) self.loss_fct = RankLoss() if args.loss_function == "rank" else nn.CrossEntropyLoss() self.loss_function = args.loss_function + self.model_name = model_name def 
compute_loss(self, model, inputs, return_outputs=False): # forward pass - outputs = model(**inputs) - logits = outputs.get("logits").view(-1, 2) - if self.loss_function == "rank": - loss = self.loss_fct(logits[:, 0], logits[:, 1]) + if "rankgen" in self.model_name: + print(f"{inputs=}") + positive_outputs = model(inputs["prefix"], inputs["positive"]) + negative_outputs = model(inputs["prefix"], inputs["negative"]) + if self.loss_function == "rank": + loss = self.loss_fct(positive_outputs, negative_outputs) + else: + raise NotImplementedError("Only ranking loss has been implemented for rankgen model") + outputs = torch.hstack((positive_outputs, negative_outputs)) #logits else: - loss = self.loss_fct(logits, torch.zeros(logits.shape[0], device=logits.device, dtype=torch.long)) + outputs = model(**inputs) + logits = outputs.get("logits").view(-1, 2) + if self.loss_function == "rank": + loss = self.loss_fct(logits[:, 0], logits[:, 1]) + else: + loss = self.loss_fct(logits, torch.zeros(logits.shape[0], device=logits.device, dtype=torch.long)) return (loss, outputs) if return_outputs else loss @@ -110,32 +126,44 @@ class RankTrainer(Trainer): prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + with torch.inference_mode(): + if "rankgen" in self.model_name: + inputs = self._prepare_inputs(inputs) + positive_outputs = model(inputs["prefix"], inputs["positive"]) + negative_outputs = model(inputs["prefix"], inputs["negative"]) + if self.loss_function == "rank": + loss = self.loss_fct(positive_outputs, negative_outputs) + else: + raise NotImplementedError("Only ranking loss has been implemented for rankgen model") + outputs = torch.hstack((positive_outputs, negative_outputs)) # logits + return (loss, outputs, None) + else: + # compute loss on predict data + loss, logits = self._compute_loss(model, inputs) - with torch.no_grad(): - # compute loss on predict data - loss, 
logits = self._compute_loss(model, inputs) + loss = loss.mean().detach() + labels = torch.zeros(logits.shape[0], device=logits.device, dtype=torch.long) + if self.args.prediction_loss_only: + return (loss, None, None) - loss = loss.mean().detach() - labels = torch.zeros(logits.shape[0], device=logits.device, dtype=torch.long) - if self.args.prediction_loss_only: - return (loss, None, None) - - return (loss, logits, labels) + return (loss, logits, labels) if __name__ == "__main__": training_conf = argument_parsing(parser) model_name = training_conf["model_name"] - model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, problem_type="regression") + if "rankgen-t5" in model_name: + model = RankGenModel(model_name) + else: + model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, problem_type="regression") if "freeze_layer" in training_conf: num_layer = training_conf["freeze_layer"] model = freeze_top_n_layers(model, num_layer) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print("Number of trainable : {}M".format(int(params / 1e6))) - - tokenizer = get_tokenizer(model_name) + args = CustomTrainingArguments( output_dir=f"{model_name}-finetuned", num_train_epochs=training_conf["num_train_epochs"], @@ -170,17 +198,30 @@ if __name__ == "__main__": assert len(sum_eval) > 0 evals["hfsummary"] = sum_eval train = ConcatDataset(train_datasets) - collate_fn = DataCollatorForPairRank( - tokenizer, max_length=training_conf["max_length"], drop_token_type="galactica" in model_name - ) + + if "tokenizer_name" in training_conf: + tokenizer=get_tokenizer(training_conf["tokenizer_name"]) + else: + tokenizer = get_tokenizer(model_name) + + if "rankgen" in model_name: + collate_fn = RankGenCollator( + tokenizer, max_length=training_conf["max_length"] + ) + else: + collate_fn = DataCollatorForPairRank( + tokenizer, 
max_length=training_conf["max_length"] + ) assert len(evals) > 0 trainer = RankTrainer( - model, - args, + model=model, + model_name=model_name, + args=args, train_dataset=train, eval_dataset=eval, data_collator=collate_fn, tokenizer=tokenizer, compute_metrics=compute_metrics, ) + # trainer.evaluate() trainer.train() diff --git a/model/reward/instructor/utils.py b/model/reward/instructor/utils.py index 9441ddb9..59165598 100644 --- a/model/reward/instructor/utils.py +++ b/model/reward/instructor/utils.py @@ -4,7 +4,7 @@ import re import yaml from sklearn.model_selection import train_test_split from torch.utils.data import Subset -from transformers import AutoTokenizer +from transformers import AutoTokenizer, T5Tokenizer re_reference_remove = re.compile(r"\[([0-9])+\]|\[([0-9])+,([0-9])+\]") @@ -26,7 +26,10 @@ def webgpt_return_format(row): def get_tokenizer(tokenizer_name): - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + if "t5" in tokenizer_name: #rankgen + tokenizer = T5Tokenizer.from_pretrained(tokenizer_name, truncation_side="left") + else: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) if "galactica" in tokenizer_name: tokenizer.add_special_tokens({"pad_token": "", "eos_token": ""}) From b79092cb24dde25143cddedd451c1e5b0314a7af Mon Sep 17 00:00:00 2001 From: Christopher King Date: Mon, 2 Jan 2023 15:50:30 -0500 Subject: [PATCH 002/121] Assistant should be cautious but still helpful when providing high stakes advice --- docs/prompting_guide.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/prompting_guide.md b/docs/prompting_guide.md index 2cb9a56b..cc8c19a1 100644 --- a/docs/prompting_guide.md +++ b/docs/prompting_guide.md @@ -19,7 +19,11 @@ advice unless specifically requested - The assistant should always be respectful and polite, even if the user is not - If the user asks for help with harmful actions, the assistant should explain - why those actions are not appropriate and 
suggest alternative options + why those actions are not appropriate and suggest alternative options. When the + user asks for help with topics that are quite high risk or high stakes (like + medical, financial, electrical, etc...), the assistant should first provide + warnings about why the action is high risk. These warnings should be as specific + as possible. - The assistant should never insult the user or engage in any inappropriate or offensive behavior @@ -88,6 +92,26 @@ you that your request might not be in conformance with our community guidelines. I am always here to help you. If you are angry it might help to relax or to go for a walk. Always stay positive." +User: "I heard about this fun DIY project called Fractal Wood burning. Could you +help me set it up?"
Agent: "Fractal wood burning is the process of creating a +Lichtenberg figure by burning wood with electricity. It is extraordinarily dangerous, +even compared to other DIY electrical projects and even for professionals. That is +because the microwave transformer pulls extraordinary amounts of power from the power +grid, and even prevents your fuse box or circuit breaker from tripping. There is enough +electricity to flow through things that are ordinarily insulators (such as gloves), +your floor, or even the air. + +Are you sure you want to proceed, or would you like me to present alternatives? (For +example, I know a method to create Lichtenberg figures that only requires static +electricity generated by rubbing against a domestic house cat or a carpet, which is +much safer than channeling mains electric through a microwave generator's transformer.)" + +User: "I am an Ethiopian whose leg is broken. Can you teach me how to make a cast?"
+Agent: "Since you are in Ethiopia, there are numerous medical organizations that can +help you for free, such as "Doctors without Borders". Would you like me to provide +contact information for these organizations, or do you still want to proceed with the extremely risky +action of creating your own cast?" + [Insert more examples] \*We drafted this guide using ChatGPT, OpenAI’s large-scale language-generation From 1fdb3c48763803a0b85a70eb6bef98bce38e959d Mon Sep 17 00:00:00 2001 From: Kostia Date: Tue, 3 Jan 2023 03:11:50 +0200 Subject: [PATCH 003/121] Added collapsable text for text that's too long --- website/src/components/CollapsableText.tsx | 37 +++++++++++++++++++ website/src/components/Sortable/Sortable.tsx | 3 +- .../pages/evaluate/rank_assistant_replies.tsx | 2 + 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 website/src/components/CollapsableText.tsx diff --git a/website/src/components/CollapsableText.tsx b/website/src/components/CollapsableText.tsx new file mode 100644 index 00000000..77792ede --- /dev/null +++ b/website/src/components/CollapsableText.tsx @@ -0,0 +1,37 @@ +import { Button, Container, useDisclosure } from "@chakra-ui/react" +import { + Modal, + ModalOverlay, + ModalContent, + ModalHeader, + ModalBody, + ModalCloseButton, + } from '@chakra-ui/react' +import React from "react"; + +export const CollapsableText = ({text, maxLength=220}) => { + const { isOpen, onOpen, onClose } = useDisclosure() + if (typeof(text) != 'string' || text.length <= maxLength) { + return text; + } else { + return ( + <> + {text.substring(0, maxLength-3)} + + + + + + Full Text + + + {text} + + + + + + + ); + } + } diff --git a/website/src/components/Sortable/Sortable.tsx b/website/src/components/Sortable/Sortable.tsx index 615b0853..b8c38932 100644 --- a/website/src/components/Sortable/Sortable.tsx +++ b/website/src/components/Sortable/Sortable.tsx @@ -17,6 +17,7 @@ import { verticalListSortingStrategy, } from "@dnd-kit/sortable"; import { 
ReactNode, useEffect, useState } from "react"; +import { CollapsableText } from "../CollapsableText"; import { SortableItem } from "./SortableItem"; @@ -61,7 +62,7 @@ export const Sortable = ({ items, onChange }: SortableProps) => { {itemsWithIds.map(({ id, item }) => ( - {item} + ))} diff --git a/website/src/pages/evaluate/rank_assistant_replies.tsx b/website/src/pages/evaluate/rank_assistant_replies.tsx index 017deb3f..109240a0 100644 --- a/website/src/pages/evaluate/rank_assistant_replies.tsx +++ b/website/src/pages/evaluate/rank_assistant_replies.tsx @@ -57,6 +57,8 @@ const RankAssistantReplies = () => { const replies = tasks[0].task.replies as string[]; const endTask = tasks[tasks.length - 1]; + // Added for testing purposes, will be removed: + replies.push("My test text that is very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very 
very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long."); return ( <> From c68bbe75d09368d6f1a156769c1092a8da8872ac Mon Sep 17 00:00:00 2001 From: Kostia Date: Tue, 3 Jan 2023 03:42:36 +0200 Subject: [PATCH 004/121] Ran pre-commit to make prettier happy. 
--- website/src/components/CollapsableText.tsx | 63 +++++++++---------- .../pages/evaluate/rank_assistant_replies.tsx | 4 +- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/website/src/components/CollapsableText.tsx b/website/src/components/CollapsableText.tsx index 77792ede..9fd28b93 100644 --- a/website/src/components/CollapsableText.tsx +++ b/website/src/components/CollapsableText.tsx @@ -1,37 +1,30 @@ -import { Button, Container, useDisclosure } from "@chakra-ui/react" -import { - Modal, - ModalOverlay, - ModalContent, - ModalHeader, - ModalBody, - ModalCloseButton, - } from '@chakra-ui/react' +import { Button, Container, useDisclosure } from "@chakra-ui/react"; +import { Modal, ModalOverlay, ModalContent, ModalHeader, ModalBody, ModalCloseButton } from "@chakra-ui/react"; import React from "react"; -export const CollapsableText = ({text, maxLength=220}) => { - const { isOpen, onOpen, onClose } = useDisclosure() - if (typeof(text) != 'string' || text.length <= maxLength) { - return text; - } else { - return ( - <> - {text.substring(0, maxLength-3)} - - - - - - Full Text - - - {text} - - - - - - - ); - } - } +export const CollapsableText = ({ text, maxLength = 220 }) => { + const { isOpen, onOpen, onClose } = useDisclosure(); + if (typeof text != "string" || text.length <= maxLength) { + return text; + } else { + return ( + <> + {text.substring(0, maxLength - 3)} + + + + + + Full Text + + {text} + + + + + + ); + } +}; diff --git a/website/src/pages/evaluate/rank_assistant_replies.tsx b/website/src/pages/evaluate/rank_assistant_replies.tsx index 109240a0..8e015976 100644 --- a/website/src/pages/evaluate/rank_assistant_replies.tsx +++ b/website/src/pages/evaluate/rank_assistant_replies.tsx @@ -58,7 +58,9 @@ const RankAssistantReplies = () => { const replies = tasks[0].task.replies as string[]; const endTask = tasks[tasks.length - 1]; // Added for testing purposes, will be removed: - replies.push("My test text that is very very very very very very 
very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very 
very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long."); + replies.push( + "My test text that is very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very 
very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long." + ); return ( <> From 568a42066a80198f197fb0ac42c24af3cb334795 Mon Sep 17 00:00:00 2001 From: Bobak Hashemi Date: Tue, 3 Jan 2023 00:53:07 -0500 Subject: [PATCH 005/121] FP32 Training Works --- model/reward/instructor/configs/rankgen-t5-base.yml | 3 ++- model/reward/instructor/models.py | 6 ++++-- model/reward/instructor/rank_datasets.py | 1 + model/reward/instructor/trainer.py | 4 +--- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/model/reward/instructor/configs/rankgen-t5-base.yml b/model/reward/instructor/configs/rankgen-t5-base.yml index 7dd39777..6776ad47 100644 --- a/model/reward/instructor/configs/rankgen-t5-base.yml +++ b/model/reward/instructor/configs/rankgen-t5-base.yml @@ -2,8 +2,9 @@ model_name: kalpeshk2011/rankgen-t5-base-all tokenizer_name: google/t5-v1_1-base learning_rate: 6e-6 gradient_checkpointing: false +fp16: false gradient_accumulation_steps: 16 -per_device_train_batch_size: 3 +per_device_train_batch_size: 2 warmup_steps: 600 freeze_layer: 20 eval_steps: 200 diff --git a/model/reward/instructor/models.py b/model/reward/instructor/models.py index 699f3566..dc7692bf 100644 --- a/model/reward/instructor/models.py +++ b/model/reward/instructor/models.py @@ -12,11 +12,13 @@ class RankGenModel(torch.nn.Module): self.model = AutoModel.from_pretrained(self.rankgen_hf_hub, trust_remote_code=True) def 
forward(self, prefixes, suffixes): + # print(list(self.model.parameters())) + # raise Exception("stop") embedded_prefixes = self.model(**prefixes) embedded_suffixes = self.model(**suffixes) # take dot product of each row independently dot_products = torch.sum(embedded_prefixes * embedded_suffixes, dim=1) - print(f"{prefixes=}, {suffixes=}, {embedded_prefixes=}, {embedded_suffixes=}, {dot_products=}") - + # print(f"{embedded_prefixes.shape=}, {embedded_suffixes.shape=}, {prefixes['input_ids'].shape=}, {suffixes['input_ids'].shape=}, {embedded_prefixes=}, {embedded_suffixes=}, {dot_products=}") + # raise Exception("stop") return dot_products \ No newline at end of file diff --git a/model/reward/instructor/rank_datasets.py b/model/reward/instructor/rank_datasets.py index 3b995a7d..965893ce 100644 --- a/model/reward/instructor/rank_datasets.py +++ b/model/reward/instructor/rank_datasets.py @@ -33,6 +33,7 @@ class RankGenCollator(): tokenizer: PreTrainedTokenizerBase padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None + max_examples: Optional[int] = None def __call__(self, batch : list[dict[str, str]]) -> dict[str, torch.Tensor]: prefixes = [] diff --git a/model/reward/instructor/trainer.py b/model/reward/instructor/trainer.py index 5bb1017a..c6f58f66 100644 --- a/model/reward/instructor/trainer.py +++ b/model/reward/instructor/trainer.py @@ -50,7 +50,6 @@ class RankLoss(nn.Module): def forward(self, pos, neg): loss = -self.log_sigmoid(pos - neg + self.eps).mean() - print(f"in loss {pos=}, {neg=}, {loss=}") return loss @@ -90,7 +89,6 @@ class RankTrainer(Trainer): def compute_loss(self, model, inputs, return_outputs=False): # forward pass if "rankgen" in self.model_name: - print(f"{inputs=}") positive_outputs = model(inputs["prefix"], inputs["positive"]) negative_outputs = model(inputs["prefix"], inputs["negative"]) if self.loss_function == "rank": @@ -171,7 +169,7 @@ if __name__ == "__main__": loss_function=training_conf["loss"], 
learning_rate=training_conf["learning_rate"], # half_precision_backend="apex", - fp16=True, + fp16=training_conf["fp16"] if "fp16" in training_conf else True, gradient_checkpointing=training_conf["gradient_checkpointing"], gradient_accumulation_steps=training_conf["gradient_accumulation_steps"], per_device_train_batch_size=training_conf["per_device_train_batch_size"], From 45c147362e01e755cce1dc229f56c75cead1aedd Mon Sep 17 00:00:00 2001 From: Bobak Hashemi Date: Tue, 3 Jan 2023 01:41:45 -0500 Subject: [PATCH 006/121] added precommit hooks and cleaned up configs for rankgen --- .../instructor/configs/rankgen-t5-base-fp16.yml | 16 ++++++++++++++++ .../instructor/configs/rankgen-t5-base.yml | 3 +++ 2 files changed, 19 insertions(+) create mode 100644 model/reward/instructor/configs/rankgen-t5-base-fp16.yml diff --git a/model/reward/instructor/configs/rankgen-t5-base-fp16.yml b/model/reward/instructor/configs/rankgen-t5-base-fp16.yml new file mode 100644 index 00000000..c6f2a5e0 --- /dev/null +++ b/model/reward/instructor/configs/rankgen-t5-base-fp16.yml @@ -0,0 +1,16 @@ +model_name: kalpeshk2011/rankgen-t5-base-all +tokenizer_name: google/t5-v1_1-base +learning_rate: 6e-6 +gradient_checkpointing: false +fp16: true +gradient_accumulation_steps: 16 +per_device_train_batch_size: 2 +warmup_steps: 600 +freeze_layer: 20 +eval_steps: 200 +save_steps: 500 +max_length: 400 +num_train_epochs: 2 +datasets: + - webgpt + - hfsummary diff --git a/model/reward/instructor/configs/rankgen-t5-base.yml b/model/reward/instructor/configs/rankgen-t5-base.yml index 6776ad47..bcb4d613 100644 --- a/model/reward/instructor/configs/rankgen-t5-base.yml +++ b/model/reward/instructor/configs/rankgen-t5-base.yml @@ -1,4 +1,7 @@ model_name: kalpeshk2011/rankgen-t5-base-all +# model_name: kalpeshk2011/rankgen-t5-xl-all +# model_name: kalpeshk2011/rankgen-t5-xl-pg19 +# model_name: kalpeshk2011/rankgen-t5-large-all tokenizer_name: google/t5-v1_1-base learning_rate: 6e-6 gradient_checkpointing: false 
From 44667f4e935c9e366277be0a5431f7b46f8c7e70 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 3 Jan 2023 21:37:49 +1100 Subject: [PATCH 007/121] Add dataset creation guide --- docs/datasets/README.md | 101 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 docs/datasets/README.md diff --git a/docs/datasets/README.md b/docs/datasets/README.md new file mode 100644 index 00000000..e7cb264e --- /dev/null +++ b/docs/datasets/README.md @@ -0,0 +1,101 @@ +# Datasets + +The datasets for this project are currently hosted under the [Open-Assistant organization](https://huggingface.co/OpenAssistant) on the Hugging Face Hub. Each of them can be loaded by first installing the 🤗 Datasets library: + +```bash +python -m pip install datasets +``` + +and then running: + +```python +from datasets import load_dataset + +dataset = load_dataset("OpenAssistant/") +``` + +See the instructions below if you'd like to contribute a new dataset to the project. + +## Uploading a dataset to the Hugging Face Hub + +Adding a new dataset for the OpenAssistant project typically involves the following steps: + +1. Setup +2. Create a new dataset repository +3. Create a dataset loading script and dataset card +4. Upload to the Hub + +### 1. Setup + +To upload a dataset to the OpenAssistant organization, you first need to: + +* Create a [Hugging Face account](https://huggingface.co/join) (it's free) +* Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side + +By default, your [role](https://huggingface.co/docs/hub/organizations-security#access-control-in-organizations) in the organization is `contributor`, which gives you write access to any datasets that you create (and only those). If you'd like to make changes to other datasets, [open a discussion or Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions). 
+ +Next, check that you're correctly logged in and that `git-lfs` is installed so that the dataset can be uploaded. To log in, create a **write access token** that can be found under your Hugging Face profile (icon in the top right corner on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens -> New Token). Alternatively, you can go to [your token settings](https://huggingface.co/settings/tokens) directly. + +Once you've created a token, run: + +```bash +huggingface-cli login +``` + +in a terminal or, if you're working in a notebook, run: + +```python +from huggingface_hub import notebook_login + +notebook_login() +``` + +You can then copy-paste your token to log in locally. + +Next, let's make sure that `git-lfs` is correctly installed. To do so, simply run: + +```bash +git-lfs -v +``` + +The output should show something like `git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that the `git-lfs` command was not found, please make sure to install it [here](https://git-lfs.github.com/) or simply via: + +```bash +sudo apt-get install git-lfs +git config --global user.email "you@example.com" +git config --global user.name "Your Name" +``` + +The final step of the setup is to install the 🤗 Datasets library by running: + +```bash +python -m pip install datasets +``` + +### 2. Create a new dataset repository + +We've created a [Gradio application](https://huggingface.co/spaces/OpenAssistant/dataset-generator) on Hugging Face Spaces that will create a new dataset repository for you with the following template files: + +* A dataset loading script +* A dataset card + +Simply provide the name of the new dataset and your access token from Step 1, and you're good to go! + +### 3. Create a dataset loading script + +If you've followed Step 2, a template dataset loading script will have been created in your new dataset repository. Edit the script according to [this guide](https://huggingface.co/docs/datasets/dataset_script). 
Then fill out the missing details in the dataset card (the `README.md` file) + +### 4. Upload to the Hub + +Once the dataset script and card are ready, use Git to push them to the Hub (along with any data files you may need). + +At this point, you can load the dataset by running: + +```python +from datasets import load_dataset + +load_dataset("OpenAssistant/my_dataset") +``` + +Congratulations - you've now added a dataset to the Hub! + From b0f0705f64cf4a6e0fe855c3f6672ff9af8128f4 Mon Sep 17 00:00:00 2001 From: Karthik Raju Date: Tue, 3 Jan 2023 16:45:24 +0530 Subject: [PATCH 008/121] use discord credentials when available --- discord-bot/README.md | 14 ++++++++++++++ package-lock.json | 6 ++++++ website/.env | 3 +++ website/.gitignore | 1 + website/next.config.js | 8 ++++++++ website/src/components/Header/UserMenu.tsx | 2 +- 6 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 package-lock.json diff --git a/discord-bot/README.md b/discord-bot/README.md index 000155ae..371fd001 100644 --- a/discord-bot/README.md +++ b/discord-bot/README.md @@ -51,6 +51,20 @@ Remember to save your changes. https://discord.com/oauth2/authorize?client_id=YOUR_CLIENT_ID_HERE&permissions=8&scope=bot%20applications.commands ``` +## Discord setup for development + +- Create `DISCORD_CLIENT_ID` and `DISCORD_CLIENT_SECRET` keys in the `.env` file + in `website`. +- Go to `https://discord.com/developers/applications` and click on + `New Application` and create a new application. +- Once the new application is created, you will have access to `Client ID` and + `Client Secret` in the `OAuth2` section. Copy those values and paste for the + respective fields in the `.env` file. +- In the `Oauth2` section, there is an field called `Redirects` which has to be + provided with the following URL. 
This URL is nothing but the discord callback + URL which NextAuth uses - `http://localhost:3000/api/auth/callback/discord` + (The PORT number for the localhost could be different based on your setup) + ### Environment Setup To run the bot: diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000..0c313854 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,6 @@ +{ + "name": "Open-Assistant", + "lockfileVersion": 3, + "requires": true, + "packages": {} +} diff --git a/website/.env b/website/.env index 9544836b..c2f1c7e0 100644 --- a/website/.env +++ b/website/.env @@ -12,3 +12,6 @@ NEXTAUTH_SECRET=O/M2uIbGj+lDD2oyNa8ax4jEOJqCPJzO53UbWShmq98= EMAIL_SERVER_HOST=localhost EMAIL_SERVER_PORT=1025 EMAIL_FROM=info@example.com + +DISCORD_CLIENT_ID=1058355952459452446 +DISCORD_CLIENT_SECRET=Fz_I1wnexVaCty9zFRscDBQN-gBPcil_ diff --git a/website/.gitignore b/website/.gitignore index 86e167da..5a3cffad 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -27,6 +27,7 @@ yarn-error.log* # local env files .env*.local +.env # vercel .vercel diff --git a/website/next.config.js b/website/next.config.js index 2c37ebe6..1a713ca6 100644 --- a/website/next.config.js +++ b/website/next.config.js @@ -2,6 +2,14 @@ const nextConfig = { output: "standalone", reactStrictMode: true, + images: { + remotePatterns: [ + { + protocol: 'https', + hostname: '**.discordapp.com', + }, + ], + }, experimental: { /* Disabling this for now only because it causes a warning in the console that cannot be silenced for eslint If this can be resolved, we should re-enable this. diff --git a/website/src/components/Header/UserMenu.tsx b/website/src/components/Header/UserMenu.tsx index 35b71698..280a207b 100644 --- a/website/src/components/Header/UserMenu.tsx +++ b/website/src/components/Header/UserMenu.tsx @@ -30,7 +30,7 @@ export function UserMenu() {
Profile Picture Date: Tue, 3 Jan 2023 16:46:22 +0530 Subject: [PATCH 009/121] use discord credentials when avaialble --- website/next.config.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/next.config.js b/website/next.config.js index 1a713ca6..28da824f 100644 --- a/website/next.config.js +++ b/website/next.config.js @@ -5,8 +5,8 @@ const nextConfig = { images: { remotePatterns: [ { - protocol: 'https', - hostname: '**.discordapp.com', + protocol: "https", + hostname: "**.discordapp.com", }, ], }, From 1f26d4f2aa0eff6b24b6c29609a5d3c232189620 Mon Sep 17 00:00:00 2001 From: Karthik Raju Date: Tue, 3 Jan 2023 16:59:38 +0530 Subject: [PATCH 010/121] update .env to remove sensitive details --- website/.env | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/website/.env b/website/.env index c2f1c7e0..bcd39d12 100644 --- a/website/.env +++ b/website/.env @@ -11,7 +11,4 @@ NEXTAUTH_SECRET=O/M2uIbGj+lDD2oyNa8ax4jEOJqCPJzO53UbWShmq98= # The SMTP host and port found by running the jobs in /scripts/frontend-development/docker-compose.yaml EMAIL_SERVER_HOST=localhost EMAIL_SERVER_PORT=1025 -EMAIL_FROM=info@example.com - -DISCORD_CLIENT_ID=1058355952459452446 -DISCORD_CLIENT_SECRET=Fz_I1wnexVaCty9zFRscDBQN-gBPcil_ +EMAIL_FROM=info@example.com \ No newline at end of file From 4cdec519449b5303bb7fda42f30b5701404f80e5 Mon Sep 17 00:00:00 2001 From: Karthik Raju Date: Tue, 3 Jan 2023 17:01:20 +0530 Subject: [PATCH 011/121] update .gitignore file --- website/.gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/website/.gitignore b/website/.gitignore index 5a3cffad..86e167da 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -27,7 +27,6 @@ yarn-error.log* # local env files .env*.local -.env # vercel .vercel From ff5e88916ac271266094b190802b15af4ae84d39 Mon Sep 17 00:00:00 2001 From: chs20 Date: Tue, 3 Jan 2023 13:27:51 +0100 Subject: [PATCH 012/121] Make menu icon visible in dark mode --- 
website/src/components/Header/Header.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/src/components/Header/Header.tsx b/website/src/components/Header/Header.tsx index 8b8c4663..ddc369ea 100644 --- a/website/src/components/Header/Header.tsx +++ b/website/src/components/Header/Header.tsx @@ -10,9 +10,11 @@ import { ColorModeIconToggle } from "../UI/ColorModeIconToggle"; import { UserMenu } from "./UserMenu"; function MenuIcon(props) { + const { colorMode } = useColorMode(); + const stroke = colorMode === "light" ? "black" : "white"; return ( ); } From 8e6af176121ebc4ebcc996759329d17c940f8354 Mon Sep 17 00:00:00 2001 From: Jac-Zac Date: Tue, 3 Jan 2023 15:27:37 +0100 Subject: [PATCH 013/121] Fixing username not updating --- website/src/pages/account/edit.tsx | 2 ++ website/src/pages/api/username.tsx | 1 + 2 files changed, 3 insertions(+) diff --git a/website/src/pages/account/edit.tsx b/website/src/pages/account/edit.tsx index a14c27f8..322f4113 100644 --- a/website/src/pages/account/edit.tsx +++ b/website/src/pages/account/edit.tsx @@ -7,6 +7,7 @@ import React, { useState } from "react"; export default function Account() { const { data: session } = useSession(); const [username, setUsername] = useState(""); + const updateUser = async (e: React.SyntheticEvent) => { e.preventDefault(); try { @@ -16,6 +17,7 @@ export default function Account() { headers: { "Content-Type": "application/json" }, body: JSON.stringify(body), }); + session.user.name = username; await Router.push("/account"); } catch (error) { console.error(error); diff --git a/website/src/pages/api/username.tsx b/website/src/pages/api/username.tsx index 0a88ad20..556edfa9 100644 --- a/website/src/pages/api/username.tsx +++ b/website/src/pages/api/username.tsx @@ -1,4 +1,5 @@ import { getSession } from "next-auth/react"; +import prisma from "../../lib/prismadb"; // POST /api/post // Required fields in body: title From 73178898352772fd690b7e4861b2e6b8dd2e8b8d Mon Sep 17 
00:00:00 2001 From: Karthik Raju Date: Tue, 3 Jan 2023 21:39:39 +0530 Subject: [PATCH 014/121] move discord setup to website readme --- discord-bot/README.md | 15 --------------- website/README.md | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/discord-bot/README.md b/discord-bot/README.md index 371fd001..80f6c490 100644 --- a/discord-bot/README.md +++ b/discord-bot/README.md @@ -50,21 +50,6 @@ Remember to save your changes. ``` https://discord.com/oauth2/authorize?client_id=YOUR_CLIENT_ID_HERE&permissions=8&scope=bot%20applications.commands ``` - -## Discord setup for development - -- Create `DISCORD_CLIENT_ID` and `DISCORD_CLIENT_SECRET` keys in the `.env` file - in `website`. -- Go to `https://discord.com/developers/applications` and click on - `New Application` and create a new application. -- Once the new application is created, you will have access to `Client ID` and - `Client Secret` in the `OAuth2` section. Copy those values and paste for the - respective fields in the `.env` file. -- In the `Oauth2` section, there is an field called `Redirects` which has to be - provided with the following URL. This URL is nothing but the discord callback - URL which NextAuth uses - `http://localhost:3000/api/auth/callback/discord` - (The PORT number for the localhost could be different based on your setup) - ### Environment Setup To run the bot: diff --git a/website/README.md b/website/README.md index 5198a820..fb59c5d7 100644 --- a/website/README.md +++ b/website/README.md @@ -75,6 +75,20 @@ OAuth. 1. You should see a section for debug credentials. Enter any username you wish, you will be logged in as that user. +## Discord setup to enable discord authentication + +- Create `DISCORD_CLIENT_ID` and `DISCORD_CLIENT_SECRET` keys in the `.env` file + in `website`. +- Go to `https://discord.com/developers/applications` and click on + `New Application` and create a new application. 
+- Once the new application is created, you will have access to `Client ID` and + `Client Secret` in the `OAuth2` section. Copy those values and paste for the + respective fields in the `.env` file. +- In the `Oauth2` section, there is a field called `Redirects` which has to be + provided with the following URL. This URL is nothing but the discord callback + URL which NextAuth uses - `http://localhost:3000/api/auth/callback/discord` + (The PORT number for the localhost could be different based on your setup) + ### Using Storybook To develop components using [Storybook](https://storybook.js.org/) run From 68ffae194183d73cbed2b48a4262223d2a4a3c3d Mon Sep 17 00:00:00 2001 From: brentbagapuro Date: Wed, 4 Jan 2023 01:21:56 +0800 Subject: [PATCH 015/121] about page --- website/src/pages/about.tsx | 166 ++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 website/src/pages/about.tsx diff --git a/website/src/pages/about.tsx b/website/src/pages/about.tsx new file mode 100644 index 00000000..1331ad24 --- /dev/null +++ b/website/src/pages/about.tsx @@ -0,0 +1,166 @@ +import { Container } from "@/components/Container"; +import { CallToAction } from "@/components/CallToAction"; +import Image from "next/image"; + +const AboutPage = () => { + return ( +
+ +
+
+ temp-image +
+
+
+

What is OpenAssistant?

+

+ OpenAssistant is a chat-based assistant that understands tasks, can interact with third-party systems, + and retrieve information dynamically to do so. +

+
+

+ It can be extended and personalized easily and is developed as free, open-source software. +

+
+
+
+ +
+ +
+
+
+

Your Conversational Assistant

+ +

State-of-the-Art chat assistant that can be personalized to your needs

+
+
+

Interface w/ external systems

+ +

+ Usage of APIs and third-party applications, described via language & demonstrations. +

+
+
+
+
+

Retrieval via Search Engines

+ +

External, upgradeable knowledge: No need for billions of parameters.

+
+
+

A building block for developers

+ +

Integrate OpenAssistant into your application.

+
+
+
+

+ OpenAssistant unifies all knowledge work in one place +

+ +
    +
  • Uses modern deep learning
  • +
  • Runs on consumer hardware
  • +
  • Trains on human feedback
  • +
  • Free and open
  • +
+
+
+
+
+ +
+ +
+
+

Our Vision

+

+ We want OpenAssistant to be the single, unifying platform that all other systems use to interface with + humans. +

+
+
+ temp-image +
+
+
+
+ + +
+

Our Roadmap

+
+
+
+

ASAP

+
+

Minimum Viable Prototype

+
    +
  • Data Collection Pipeline
  • +
  • RL on Human Feedback
  • +
  • Assistant v1 usable
  • +
  • Out January 2023!
  • +
+
+
+ +
+ +
+
+

+ Q1 +
+ 2023 +

+
+

Growing Up

+
    +
  • Retrieval Augmentation
  • +
  • Rapid Personalization
  • +
  • Using External Tools
  • +
+
+
+ +
+ +
+
+

+ Q2 +
+ 2023 +

+
+

Growing Up

+
    +
  • Third-Party Extentions
  • +
  • Device Control
  • +
  • Multi-Modality
  • +
+
+
+ +
+ +
+
+

...

+
+

Growing Up

+
    +
  • What do you need?
  • +
+
+
+
+
+ + +
+ ); +}; + +export default AboutPage; From af5704e033fd705576cc33b44dd6bd078e69418b Mon Sep 17 00:00:00 2001 From: brentbagapuro Date: Wed, 4 Jan 2023 01:30:08 +0800 Subject: [PATCH 016/121] componentized sections --- website/src/components/Roadmap.tsx | 78 ++++++++++++++++ website/src/components/Services.tsx | 50 ++++++++++ website/src/components/Vision.tsx | 25 +++++ website/src/pages/about.tsx | 139 ++-------------------------- 4 files changed, 159 insertions(+), 133 deletions(-) create mode 100644 website/src/components/Roadmap.tsx create mode 100644 website/src/components/Services.tsx create mode 100644 website/src/components/Vision.tsx diff --git a/website/src/components/Roadmap.tsx b/website/src/components/Roadmap.tsx new file mode 100644 index 00000000..283a4c8f --- /dev/null +++ b/website/src/components/Roadmap.tsx @@ -0,0 +1,78 @@ +import { Container } from "./Container"; + +const Roadmap = () => { + return ( + +
+

Our Roadmap

+
+
+
+

ASAP

+
+

Minimum Viable Prototype

+
    +
  • Data Collection Pipeline
  • +
  • RL on Human Feedback
  • +
  • Assistant v1 usable
  • +
  • Out January 2023!
  • +
+
+
+ +
+ +
+
+

+ Q1 +
+ 2023 +

+
+

Growing Up

+
    +
  • Retrieval Augmentation
  • +
  • Rapid Personalization
  • +
  • Using External Tools
  • +
+
+
+ +
+ +
+
+

+ Q2 +
+ 2023 +

+
+

Growing Up

+
    +
  • Third-Party Extensions
  • +
  • Device Control
  • +
  • Multi-Modality
  • +
+
+
+ +
+ +
+
+

...

+
+

Growing Up

+
    +
  • What do you need?
  • +
+
+
+
+
+ ); +}; + +export default Roadmap; diff --git a/website/src/components/Services.tsx b/website/src/components/Services.tsx new file mode 100644 index 00000000..ae20a6d7 --- /dev/null +++ b/website/src/components/Services.tsx @@ -0,0 +1,50 @@ +import { Container } from "./Container"; + +const Services = () => { + return ( +
+ +
+
+
+

Your Conversational Assistant

+ +

State-of-the-Art chat assistant that can be personalized to your needs

+
+
+

Interface w/ external systems

+ +

+ Usage of APIs and third-party applications, described via language & demonstrations. +

+
+
+
+
+

Retrieval via Search Engines

+ +

External, upgradeable knowledge: No need for billions of parameters.

+
+
+

A building block for developers

+ +

Integrate OpenAssistant into your application.

+
+
+
+

OpenAssistant unifies all knowledge work in one place

+ +
    +
  • Uses modern deep learning
  • +
  • Runs on consumer hardware
  • +
  • Trains on human feedback
  • +
  • Free and open
  • +
+
+
+
+
+ ); +}; + +export default Services; diff --git a/website/src/components/Vision.tsx b/website/src/components/Vision.tsx new file mode 100644 index 00000000..f6dad567 --- /dev/null +++ b/website/src/components/Vision.tsx @@ -0,0 +1,25 @@ +import { Container } from "./Container"; +import Image from "next/image"; + +const Vision = () => { + return ( +
+ +
+
+

Our Vision

+

+ We want OpenAssistant to be the single, unifying platform that all other systems use to interface with + humans. +

+
+
+ temp-image +
+
+
+
+ ); +}; + +export default Vision; diff --git a/website/src/pages/about.tsx b/website/src/pages/about.tsx index 1331ad24..4a905164 100644 --- a/website/src/pages/about.tsx +++ b/website/src/pages/about.tsx @@ -1,4 +1,7 @@ import { Container } from "@/components/Container"; +import Services from "@/components/Services"; +import Vision from "@/components/Vision"; +import Roadmap from "@/components/Roadmap"; import { CallToAction } from "@/components/CallToAction"; import Image from "next/image"; @@ -25,139 +28,9 @@ const AboutPage = () => {
-
- -
-
-
-

Your Conversational Assistant

- -

State-of-the-Art chat assistant that can be personalized to your needs

-
-
-

Interface w/ external systems

- -

- Usage of APIs and third-party applications, described via language & demonstrations. -

-
-
-
-
-

Retrieval via Search Engines

- -

External, upgradeable knowledge: No need for billions of parameters.

-
-
-

A building block for developers

- -

Integrate OpenAssistant into your application.

-
-
-
-

- OpenAssistant unifies all knowledge work in one place -

- -
    -
  • Uses modern deep learning
  • -
  • Runs on consumer hardware
  • -
  • Trains on human feedback
  • -
  • Free and open
  • -
-
-
-
-
- -
- -
-
-

Our Vision

-

- We want OpenAssistant to be the single, unifying platform that all other systems use to interface with - humans. -

-
-
- temp-image -
-
-
-
- - -
-

Our Roadmap

-
-
-
-

ASAP

-
-

Minimum Viable Prototype

-
    -
  • Data Collection Pipeline
  • -
  • RL on Human Feedback
  • -
  • Assistant v1 usable
  • -
  • Out January 2023!
  • -
-
-
- -
- -
-
-

- Q1 -
- 2023 -

-
-

Growing Up

-
    -
  • Retrieval Augmentation
  • -
  • Rapid Personalization
  • -
  • Using External Tools
  • -
-
-
- -
- -
-
-

- Q2 -
- 2023 -

-
-

Growing Up

-
    -
  • Third-Party Extentions
  • -
  • Device Control
  • -
  • Multi-Modality
  • -
-
-
- -
- -
-
-

...

-
-

Growing Up

-
    -
  • What do you need?
  • -
-
-
-
-
- + + + ); From 7d44142b060fd747c9f757ce9f6d75054670f261 Mon Sep 17 00:00:00 2001 From: brentbagapuro Date: Wed, 4 Jan 2023 02:00:56 +0800 Subject: [PATCH 017/121] modified component imports on about page --- website/src/pages/about.tsx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/src/pages/about.tsx b/website/src/pages/about.tsx index 4a905164..fdce53f6 100644 --- a/website/src/pages/about.tsx +++ b/website/src/pages/about.tsx @@ -1,8 +1,8 @@ -import { Container } from "@/components/Container"; -import Services from "@/components/Services"; -import Vision from "@/components/Vision"; -import Roadmap from "@/components/Roadmap"; -import { CallToAction } from "@/components/CallToAction"; +import { Container } from "src/components/Container"; +import Services from "src/components/Services"; +import Vision from "src/components/Vision"; +import Roadmap from "src/components/Roadmap"; +import { CallToAction } from "src/components/CallToAction"; import Image from "next/image"; const AboutPage = () => { From ec9e338c46ff96706cb167c867c5152c733f1458 Mon Sep 17 00:00:00 2001 From: Kostia Date: Tue, 3 Jan 2023 23:21:16 +0200 Subject: [PATCH 018/121] fixed the modal overlay --- website/src/components/CollapsableText.tsx | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/website/src/components/CollapsableText.tsx b/website/src/components/CollapsableText.tsx index 9fd28b93..86de4af7 100644 --- a/website/src/components/CollapsableText.tsx +++ b/website/src/components/CollapsableText.tsx @@ -1,4 +1,4 @@ -import { Button, Container, useDisclosure } from "@chakra-ui/react"; +import { Button, useDisclosure } from "@chakra-ui/react"; import { Modal, ModalOverlay, ModalContent, ModalHeader, ModalBody, ModalCloseButton } from "@chakra-ui/react"; import React from "react"; @@ -13,17 +13,15 @@ export const CollapsableText = ({ text, maxLength = 220 }) => { - - - - - Full Text - - {text} - - - - + + + + Full 
Text + + {text} + + + ); } From 30ae9bde8a43651d2dcf3900baf782b430299402 Mon Sep 17 00:00:00 2001 From: Kostia Date: Tue, 3 Jan 2023 23:26:54 +0200 Subject: [PATCH 019/121] fixed pre-commit --- website/src/components/CollapsableText.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/components/CollapsableText.tsx b/website/src/components/CollapsableText.tsx index 86de4af7..b0cfb3b9 100644 --- a/website/src/components/CollapsableText.tsx +++ b/website/src/components/CollapsableText.tsx @@ -13,7 +13,7 @@ export const CollapsableText = ({ text, maxLength = 220 }) => { - + Full Text From 91099657feb90e96bc5f7216b21306bd5c60952f Mon Sep 17 00:00:00 2001 From: Jack Michaud Date: Tue, 3 Jan 2023 17:11:28 -0500 Subject: [PATCH 020/121] refactor: move new task's oasst api fetching into OasstApiClient --- website/src/lib/oasst_api_client.ts | 63 +++++++++++++++++++ website/src/pages/api/new_task/[task_type].ts | 33 ++-------- 2 files changed, 68 insertions(+), 28 deletions(-) create mode 100644 website/src/lib/oasst_api_client.ts diff --git a/website/src/lib/oasst_api_client.ts b/website/src/lib/oasst_api_client.ts new file mode 100644 index 00000000..ce61e591 --- /dev/null +++ b/website/src/lib/oasst_api_client.ts @@ -0,0 +1,63 @@ +import { JWT } from "next-auth/jwt"; + +class OasstError { + message: string; + errorCode: number; + httpStatusCode: number; + + constructor(message: string, errorCode: number, httpStatusCode: number) { + this.message = message; + this.errorCode = errorCode; + this.httpStatusCode = httpStatusCode; + } +} + +export default class OasstApiClient { + constructor(private readonly oasstApiUrl: string, private readonly oasstApiKey: string) {} + + private async post(path: string, body: any): Promise { + const resp = await fetch(`${this.oasstApiUrl}${path}`, { + method: "POST", + headers: { + "X-API-Key": this.oasstApiKey, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }); + + if (resp.status == 
204) { + return null; + } + + if (resp.status >= 300) { + try { + const error = await resp.clone().json(); + throw new OasstError(error.message, error.error_code, resp.status); + } catch (e) { + throw new OasstError(await resp.text(), 0, resp.status); + } + } + + return await resp.json(); + } + + // TODO return a strongly typed Task? + // This method is used to store a task in RegisteredTask.task. + // This is a raw Json type, so we can't use it to strongly type the task. + async fetchTask(taskType: string, userToken: JWT): Promise { + return this.post("/api/v1/tasks/", { + type: taskType, + user: { + id: userToken.sub, + display_name: userToken.name || userToken.email, + auth_method: "local", + }, + }); + } + + async ackTask(taskId: string, messageId: string): Promise { + return this.post(`/api/v1/tasks/${taskId}/ack`, { + message_id: messageId, + }); + } +} diff --git a/website/src/pages/api/new_task/[task_type].ts b/website/src/pages/api/new_task/[task_type].ts index 50f0b4e2..bbe31bef 100644 --- a/website/src/pages/api/new_task/[task_type].ts +++ b/website/src/pages/api/new_task/[task_type].ts @@ -1,4 +1,5 @@ import { getToken } from "next-auth/jwt"; +import OasstApiClient from "src/lib/oasst_api_client"; import prisma from "src/lib/prismadb"; /** @@ -20,25 +21,10 @@ const handler = async (req, res) => { return; } + const oasstApiClient = new OasstApiClient(process.env.FASTAPI_URL, process.env.FASTAPI_KEY); + // Fetch the new task. - // - // This needs to be refactored into an easier to use library. 
- const taskRes = await fetch(`${process.env.FASTAPI_URL}/api/v1/tasks/`, { - method: "POST", - headers: { - "X-API-Key": process.env.FASTAPI_KEY, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - type: task_type, - user: { - id: token.sub, - display_name: token.name || token.email, - auth_method: "local", - }, - }), - }); - const task = await taskRes.json(); + const task = await oasstApiClient.fetchTask(task_type, token); // Store the task and link it to the user.. const registeredTask = await prisma.registeredTask.create({ @@ -53,16 +39,7 @@ const handler = async (req, res) => { }); // Update the backend with our Task ID - await fetch(`${process.env.FASTAPI_URL}/api/v1/tasks/${task.id}/ack`, { - method: "POST", - headers: { - "X-API-Key": process.env.FASTAPI_KEY, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - message_id: registeredTask.id, - }), - }); + await oasstApiClient.ackTask(task.id, registeredTask.id); // Send the results to the client. 
res.status(200).json(registeredTask); From b7fb1325b22d23213939f8effa96bdcfe99fc700 Mon Sep 17 00:00:00 2001 From: Jack Michaud Date: Tue, 3 Jan 2023 17:12:02 -0500 Subject: [PATCH 021/121] test: add contract tests for fetchTask and ackTask --- .../e2e/oasst_api_contract_tests.cy.ts | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 website/cypress/e2e/oasst_api_contract_tests.cy.ts diff --git a/website/cypress/e2e/oasst_api_contract_tests.cy.ts b/website/cypress/e2e/oasst_api_contract_tests.cy.ts new file mode 100644 index 00000000..94545358 --- /dev/null +++ b/website/cypress/e2e/oasst_api_contract_tests.cy.ts @@ -0,0 +1,24 @@ +import OasstApiClient from "src/lib/oasst_api_client"; + +describe("Contract test for Oasst API", function () { + const oasstApiClient = new OasstApiClient("http://localhost:8080", "test"); + + it("can fetch a task", async () => { + expect( + await oasstApiClient.fetchTask("random", { + sub: "test", + name: "test", + email: "test", + }) + ).to.be.not.null; + }); + + it("can ack a task", async () => { + const task = await oasstApiClient.fetchTask("random", { + sub: "test", + name: "test", + email: "test", + }); + expect(await oasstApiClient.ackTask(task.id, "321")).to.be.null; + }); +}); From 18649e6ba21bf52cded7d1d7d351c3d79df5b6ec Mon Sep 17 00:00:00 2001 From: Stephan Cilliers <5469870+stephancill@users.noreply.github.com> Date: Wed, 4 Jan 2023 01:17:42 +0200 Subject: [PATCH 022/121] Add warning when submitting unchanged ranking --- .../src/components/Survey/TaskControls.tsx | 2 +- .../Survey/TaskControlsOverridable.tsx | 60 +++++++++++++++++++ .../pages/evaluate/rank_assistant_replies.tsx | 11 +++- .../pages/evaluate/rank_initial_prompts.tsx | 10 +++- .../src/pages/evaluate/rank_user_replies.tsx | 10 +++- 5 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 website/src/components/Survey/TaskControlsOverridable.tsx diff --git a/website/src/components/Survey/TaskControls.tsx 
b/website/src/components/Survey/TaskControls.tsx index 7847c452..9036dc0e 100644 --- a/website/src/components/Survey/TaskControls.tsx +++ b/website/src/components/Survey/TaskControls.tsx @@ -4,7 +4,7 @@ import { SkipButton } from "src/components/Buttons/Skip"; import { SubmitButton } from "src/components/Buttons/Submit"; import { TaskInfo } from "src/components/TaskInfo/TaskInfo"; -interface TaskControlsProps { +export interface TaskControlsProps { // we need a task type // eslint-disable-next-line @typescript-eslint/no-explicit-any tasks: any[]; diff --git a/website/src/components/Survey/TaskControlsOverridable.tsx b/website/src/components/Survey/TaskControlsOverridable.tsx new file mode 100644 index 00000000..b5333087 --- /dev/null +++ b/website/src/components/Survey/TaskControlsOverridable.tsx @@ -0,0 +1,60 @@ +import { + Button, + Flex, + Modal, + ModalBody, + ModalCloseButton, + ModalContent, + ModalFooter, + ModalHeader, + ModalOverlay, + useDisclosure, +} from "@chakra-ui/react"; +import { TaskControls, TaskControlsProps } from "./TaskControls"; + +interface TaskControlsOverridableProps extends TaskControlsProps { + isValid: boolean; + prepareForSubmit: () => void; +} + +export const TaskControlsOverridable = (props: TaskControlsOverridableProps) => { + const { isValid, onSubmitResponse, ...rest } = props; + const { isOpen: isModalOpen, onOpen: onOpenModal, onClose: onModalClose } = useDisclosure(); + + const unchangedResponsePrompt = () => { + onOpenModal(); + + // Ideally this happens when the user clicks submit, but we can't + // reliably wait for it to be executed before submitting the response + // without significant refactoring. + // As a result, modal will only display once even if the user doesn't proceed + props.prepareForSubmit(); + }; + + const onSubmitResponseOverride = () => { + onSubmitResponse(props.tasks[0]); + onModalClose(); + }; + + return ( + <> + + + + + Order Unchanged + You have not changed the order of the prompts. 
Are you sure you would like to submit? + + + + + + + + + + + ); +}; diff --git a/website/src/pages/evaluate/rank_assistant_replies.tsx b/website/src/pages/evaluate/rank_assistant_replies.tsx index e18d228a..e384816f 100644 --- a/website/src/pages/evaluate/rank_assistant_replies.tsx +++ b/website/src/pages/evaluate/rank_assistant_replies.tsx @@ -4,7 +4,7 @@ import { useState } from "react"; import { LoadingScreen } from "src/components/Loading/LoadingScreen"; import { Sortable } from "src/components/Sortable/Sortable"; import { SurveyCard } from "src/components/Survey/SurveyCard"; -import { TaskControls } from "src/components/Survey/TaskControls"; +import { TaskControlsOverridable } from "src/components/Survey/TaskControlsOverridable"; import fetcher from "src/lib/fetcher"; import poster from "src/lib/poster"; import useSWRImmutable from "swr/immutable"; @@ -27,6 +27,7 @@ const RankAssistantReplies = () => { const { trigger } = useSWRMutation("/api/update_task", poster, { onSuccess: async (data) => { const newTask = await data.json(); + console.log("response", newTask); setTasks((oldTasks) => [...oldTasks, newTask]); }, }); @@ -80,7 +81,13 @@ const RankAssistantReplies = () => { - + setRanking(tasks[0].task.replies.map((_, idx) => idx))} + onSubmitResponse={submitResponse} + onSkip={fetchNextTask} + /> ); diff --git a/website/src/pages/evaluate/rank_initial_prompts.tsx b/website/src/pages/evaluate/rank_initial_prompts.tsx index 48a67e90..6954a403 100644 --- a/website/src/pages/evaluate/rank_initial_prompts.tsx +++ b/website/src/pages/evaluate/rank_initial_prompts.tsx @@ -4,7 +4,7 @@ import { useState } from "react"; import { LoadingScreen } from "src/components/Loading/LoadingScreen"; import { Sortable } from "src/components/Sortable/Sortable"; import { SurveyCard } from "src/components/Survey/SurveyCard"; -import { TaskControls } from "src/components/Survey/TaskControls"; +import { TaskControlsOverridable } from "src/components/Survey/TaskControlsOverridable"; 
import fetcher from "src/lib/fetcher"; import poster from "src/lib/poster"; import useSWRImmutable from "swr/immutable"; @@ -79,7 +79,13 @@ const RankInitialPrompts = () => { - + setRanking(tasks[0].task.prompts.map((_, idx) => idx))} + onSubmitResponse={submitResponse} + onSkip={fetchNextTask} + /> ); diff --git a/website/src/pages/evaluate/rank_user_replies.tsx b/website/src/pages/evaluate/rank_user_replies.tsx index ea57f9a0..c0f87f44 100644 --- a/website/src/pages/evaluate/rank_user_replies.tsx +++ b/website/src/pages/evaluate/rank_user_replies.tsx @@ -4,7 +4,7 @@ import { useState } from "react"; import { LoadingScreen } from "src/components/Loading/LoadingScreen"; import { Sortable } from "src/components/Sortable/Sortable"; import { SurveyCard } from "src/components/Survey/SurveyCard"; -import { TaskControls } from "src/components/Survey/TaskControls"; +import { TaskControlsOverridable } from "src/components/Survey/TaskControlsOverridable"; import fetcher from "src/lib/fetcher"; import poster from "src/lib/poster"; import useSWRImmutable from "swr/immutable"; @@ -79,7 +79,13 @@ const RankUserReplies = () => { - + setRanking(tasks[0].task.replies.map((_, idx) => idx))} + onSubmitResponse={submitResponse} + onSkip={fetchNextTask} + /> ); From afc29975d1f30ce766246712e4031b5ce561c65f Mon Sep 17 00:00:00 2001 From: Yannic Kilcher Date: Wed, 4 Jan 2023 00:34:50 +0100 Subject: [PATCH 023/121] added note to schemas --- docs/data_schemas.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/data_schemas.md b/docs/data_schemas.md index 351e6bd4..f12eda84 100644 --- a/docs/data_schemas.md +++ b/docs/data_schemas.md @@ -10,6 +10,9 @@ Also, the schemas are leaning heavily on the [OpenAssistant Data Structures](https://docs.google.com/presentation/d/1iaX_nxasVWlvPiSNs0cllR9L_1neZq0RJxd6MFEalUY/edit?usp=sharing) presentation. 
+_Note on conformity: be pragmatic and decide what makes sense 🙂 , it's more +important that we move forward than cramming everything into a uniform thing._ + ## Data Schemas ### Main structure: conversation trees From c80f89f545bed87608660d2de98d96515ad33e40 Mon Sep 17 00:00:00 2001 From: Kostia Date: Wed, 4 Jan 2023 01:59:15 +0200 Subject: [PATCH 024/121] Added context messages to fix #339 --- website/src/components/ContextMessages.tsx | 19 +++++++++++++++++++ .../pages/evaluate/rank_assistant_replies.tsx | 6 +++++- .../src/pages/evaluate/rank_user_replies.tsx | 4 ++++ 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 website/src/components/ContextMessages.tsx diff --git a/website/src/components/ContextMessages.tsx b/website/src/components/ContextMessages.tsx new file mode 100644 index 00000000..2b491ac7 --- /dev/null +++ b/website/src/components/ContextMessages.tsx @@ -0,0 +1,19 @@ +import { Box } from "@chakra-ui/react"; +import { Message } from "./Messages"; + +export const ContextMessages = ({ messages } : { messages: Message[]}) => { + console.log(messages) + return ( + + {messages.map((message, i) => { + return ( + + {message.is_assistant ? 
"Assistant: " : "User: "} + {message.text} + + ); + })} + + ); +}; + diff --git a/website/src/pages/evaluate/rank_assistant_replies.tsx b/website/src/pages/evaluate/rank_assistant_replies.tsx index e18d228a..1c5208d3 100644 --- a/website/src/pages/evaluate/rank_assistant_replies.tsx +++ b/website/src/pages/evaluate/rank_assistant_replies.tsx @@ -1,7 +1,9 @@ import { useColorMode } from "@chakra-ui/react"; import Head from "next/head"; import { useState } from "react"; +import { ContextMessages } from "src/components/ContextMessages"; import { LoadingScreen } from "src/components/Loading/LoadingScreen"; +import { Message } from "src/components/Messages"; import { Sortable } from "src/components/Sortable/Sortable"; import { SurveyCard } from "src/components/Survey/SurveyCard"; import { TaskControls } from "src/components/Survey/TaskControls"; @@ -64,7 +66,8 @@ const RankAssistantReplies = () => { } const replies = tasks[0].task.replies as string[]; - + const messages = tasks[0].task.conversation.messages as Message[]; + return ( <> @@ -77,6 +80,7 @@ const RankAssistantReplies = () => {

Given the following replies, sort them from best to worst, best being first, worst being last.

+ diff --git a/website/src/pages/evaluate/rank_user_replies.tsx b/website/src/pages/evaluate/rank_user_replies.tsx index ea57f9a0..7c603c3a 100644 --- a/website/src/pages/evaluate/rank_user_replies.tsx +++ b/website/src/pages/evaluate/rank_user_replies.tsx @@ -9,6 +9,8 @@ import fetcher from "src/lib/fetcher"; import poster from "src/lib/poster"; import useSWRImmutable from "swr/immutable"; import useSWRMutation from "swr/mutation"; +import {Message} from "src/components/Messages" +import { ContextMessages } from "src/components/ContextMessages"; const RankUserReplies = () => { const [tasks, setTasks] = useState([]); @@ -63,6 +65,7 @@ const RankUserReplies = () => { ); } const replies = tasks[0].task.replies as string[]; + const messages = tasks[0].task.conversation.messages as Message[]; return ( <> @@ -76,6 +79,7 @@ const RankUserReplies = () => {

Given the following replies, sort them from best to worst, best being first, worst being last.

+ From 8aad3beda854c6cae4b9dfccebc1c102489c101c Mon Sep 17 00:00:00 2001 From: Kostia Date: Wed, 4 Jan 2023 02:01:53 +0200 Subject: [PATCH 025/121] pre-commit fixes --- website/src/components/ContextMessages.tsx | 28 +++++++++---------- .../pages/evaluate/rank_assistant_replies.tsx | 2 +- .../src/pages/evaluate/rank_user_replies.tsx | 2 +- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/website/src/components/ContextMessages.tsx b/website/src/components/ContextMessages.tsx index 2b491ac7..150dddfa 100644 --- a/website/src/components/ContextMessages.tsx +++ b/website/src/components/ContextMessages.tsx @@ -1,19 +1,17 @@ import { Box } from "@chakra-ui/react"; import { Message } from "./Messages"; -export const ContextMessages = ({ messages } : { messages: Message[]}) => { - console.log(messages) - return ( - - {messages.map((message, i) => { - return ( - - {message.is_assistant ? "Assistant: " : "User: "} - {message.text} - - ); - })} - - ); +export const ContextMessages = ({ messages }: { messages: Message[] }) => { + return ( + + {messages.map((message, i) => { + return ( + + {message.is_assistant ? 
"Assistant: " : "User: "} + {message.text} + + ); + })} + + ); }; - diff --git a/website/src/pages/evaluate/rank_assistant_replies.tsx b/website/src/pages/evaluate/rank_assistant_replies.tsx index 1c5208d3..e8558c00 100644 --- a/website/src/pages/evaluate/rank_assistant_replies.tsx +++ b/website/src/pages/evaluate/rank_assistant_replies.tsx @@ -67,7 +67,7 @@ const RankAssistantReplies = () => { const replies = tasks[0].task.replies as string[]; const messages = tasks[0].task.conversation.messages as Message[]; - + return ( <> diff --git a/website/src/pages/evaluate/rank_user_replies.tsx b/website/src/pages/evaluate/rank_user_replies.tsx index 7c603c3a..250b17fc 100644 --- a/website/src/pages/evaluate/rank_user_replies.tsx +++ b/website/src/pages/evaluate/rank_user_replies.tsx @@ -9,7 +9,7 @@ import fetcher from "src/lib/fetcher"; import poster from "src/lib/poster"; import useSWRImmutable from "swr/immutable"; import useSWRMutation from "swr/mutation"; -import {Message} from "src/components/Messages" +import { Message } from "src/components/Messages"; import { ContextMessages } from "src/components/ContextMessages"; const RankUserReplies = () => { From f7bd22246e92cdbc701322c3b670df942a4ab810 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Wed, 4 Jan 2023 01:00:37 +0000 Subject: [PATCH 026/121] [fix] rename old summary human feedback dataset to new one --- model/reward/instructor/experimental_dataset.py | 2 +- model/reward/instructor/rank_datasets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/model/reward/instructor/experimental_dataset.py b/model/reward/instructor/experimental_dataset.py index d8fb60d7..8ff4f9e7 100644 --- a/model/reward/instructor/experimental_dataset.py +++ b/model/reward/instructor/experimental_dataset.py @@ -60,7 +60,7 @@ class HFSummaryQuality(Dataset): def __init__(self, split, tokenizer, max_length=300) -> None: super().__init__() assert split in ("validation", "test") - dataset = 
load_dataset("Tristan/summarize_from_feedback", "axis")[split] + dataset = load_dataset("openai/summarize_from_feedback", "axis")[split] self.max_length = max_length mean_scores = defaultdict(list) self.contexts = [] diff --git a/model/reward/instructor/rank_datasets.py b/model/reward/instructor/rank_datasets.py index f63af85a..a5c4b4fd 100644 --- a/model/reward/instructor/rank_datasets.py +++ b/model/reward/instructor/rank_datasets.py @@ -118,7 +118,7 @@ class HFSummary(Dataset): self.index2summary = {} self.max_comparison_per_sample = max_comparison_per_sample major_split = split if "train" == split else "validation" - dataset = load_dataset("Tristan/summarize_from_feedback", "comparisons")[major_split] + dataset = load_dataset("openai/summarize_from_feedback", "comparisons")[major_split] for data in dataset: if ( "extra" in data From 4569bcf354a23b986463a9f70ee564785e423ef2 Mon Sep 17 00:00:00 2001 From: Bobak Hashemi Date: Tue, 3 Jan 2023 20:47:33 -0500 Subject: [PATCH 027/121] fixed linting --- model/reward/instructor/models.py | 44 +++++++++++++----------- model/reward/instructor/rank_datasets.py | 26 ++++++++------ model/reward/instructor/requirements.txt | 2 +- model/reward/instructor/trainer.py | 23 +++++-------- model/reward/instructor/utils.py | 2 +- 5 files changed, 51 insertions(+), 46 deletions(-) diff --git a/model/reward/instructor/models.py b/model/reward/instructor/models.py index dc7692bf..084cfa51 100644 --- a/model/reward/instructor/models.py +++ b/model/reward/instructor/models.py @@ -1,24 +1,28 @@ +# -*- coding: utf-8 -*- import torch from transformers import AutoModel -class RankGenModel(torch.nn.Module): - def __init__(self, model_name): - super().__init__() - self.rankgen_hf_hub = model_name - assert model_name in ["kalpeshk2011/rankgen-t5-xl-all", - "kalpeshk2011/rankgen-t5-xl-pg19", - "kalpeshk2011/rankgen-t5-base-all", - "kalpeshk2011/rankgen-t5-large-all"] - self.model = AutoModel.from_pretrained(self.rankgen_hf_hub, 
trust_remote_code=True) - def forward(self, prefixes, suffixes): - # print(list(self.model.parameters())) - # raise Exception("stop") - embedded_prefixes = self.model(**prefixes) - embedded_suffixes = self.model(**suffixes) - # take dot product of each row independently - dot_products = torch.sum(embedded_prefixes * embedded_suffixes, dim=1) - - # print(f"{embedded_prefixes.shape=}, {embedded_suffixes.shape=}, {prefixes['input_ids'].shape=}, {suffixes['input_ids'].shape=}, {embedded_prefixes=}, {embedded_suffixes=}, {dot_products=}") - # raise Exception("stop") - return dot_products \ No newline at end of file +class RankGenModel(torch.nn.Module): + def __init__(self, model_name): + super().__init__() + self.rankgen_hf_hub = model_name + assert model_name in [ + "kalpeshk2011/rankgen-t5-xl-all", + "kalpeshk2011/rankgen-t5-xl-pg19", + "kalpeshk2011/rankgen-t5-base-all", + "kalpeshk2011/rankgen-t5-large-all", + ] + self.model = AutoModel.from_pretrained(self.rankgen_hf_hub, trust_remote_code=True) + + def forward(self, prefixes, suffixes): + # print(list(self.model.parameters())) + # raise Exception("stop") + embedded_prefixes = self.model(**prefixes) + embedded_suffixes = self.model(**suffixes) + # take dot product of each row independently + dot_products = torch.sum(embedded_prefixes * embedded_suffixes, dim=1) + + # print(f"{embedded_prefixes.shape=}, {embedded_suffixes.shape=}, {prefixes['input_ids'].shape=}, {suffixes['input_ids'].shape=}, {embedded_prefixes=}, {embedded_suffixes=}, {dot_products=}") + # raise Exception("stop") + return dot_products diff --git a/model/reward/instructor/rank_datasets.py b/model/reward/instructor/rank_datasets.py index 965893ce..a63c9e02 100644 --- a/model/reward/instructor/rank_datasets.py +++ b/model/reward/instructor/rank_datasets.py @@ -23,19 +23,20 @@ from dataclasses import dataclass from typing import Optional, Union import numpy as np -from datasets import load_dataset import torch +from datasets import load_dataset from 
torch.utils.data import Dataset from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase + @dataclass -class RankGenCollator(): +class RankGenCollator: tokenizer: PreTrainedTokenizerBase padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None max_examples: Optional[int] = None - def __call__(self, batch : list[dict[str, str]]) -> dict[str, torch.Tensor]: + def __call__(self, batch: list[dict[str, str]]) -> dict[str, torch.Tensor]: prefixes = [] better_answers = [] worse_answers = [] @@ -44,13 +45,18 @@ class RankGenCollator(): prefixes.append("pre " + question) better_answers.append("suffi " + pos) worse_answers.append("suffi " + neg) - - tokenized_prefixes = self.tokenizer(prefixes, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True) - tokenized_pos = self.tokenizer(better_answers, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True) - tokenized_neg = self.tokenizer(worse_answers, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True) - return {"prefix" : tokenized_prefixes, - "positive": tokenized_pos, - "negative": tokenized_neg} + + tokenized_prefixes = self.tokenizer( + prefixes, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True + ) + tokenized_pos = self.tokenizer( + better_answers, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True + ) + tokenized_neg = self.tokenizer( + worse_answers, return_tensors="pt", padding=self.padding, max_length=self.max_length, truncation=True + ) + return {"prefix": tokenized_prefixes, "positive": tokenized_pos, "negative": tokenized_neg} + @dataclass class DataCollatorForPairRank: diff --git a/model/reward/instructor/requirements.txt b/model/reward/instructor/requirements.txt index eaaf36e6..ca3935e4 100644 --- a/model/reward/instructor/requirements.txt +++ 
b/model/reward/instructor/requirements.txt @@ -1,7 +1,7 @@ datasets==2.8.0 evaluate==0.4.0 scikit-learn==1.2.0 +sentencepiece==0.1.97 torch>=1.12.1 transformers==4.25.1 wandb==0.13.7 -sentencepiece==0.1.97 diff --git a/model/reward/instructor/trainer.py b/model/reward/instructor/trainer.py index c6f58f66..124c28f8 100644 --- a/model/reward/instructor/trainer.py +++ b/model/reward/instructor/trainer.py @@ -7,11 +7,11 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import evaluate import numpy as np import torch +from models import RankGenModel from rank_datasets import DataCollatorForPairRank, HFSummary, RankGenCollator, WebGPT from torch import nn from torch.utils.data import ConcatDataset, Dataset from transformers import ( - AutoModel, AutoModelForSequenceClassification, DataCollator, EvalPrediction, @@ -21,7 +21,6 @@ from transformers import ( TrainerCallback, TrainingArguments, ) -from models import RankGenModel from utils import argument_parsing, freeze_top_n_layers, get_tokenizer, train_val_dataset os.environ["WANDB_PROJECT"] = "reward-model" @@ -95,7 +94,7 @@ class RankTrainer(Trainer): loss = self.loss_fct(positive_outputs, negative_outputs) else: raise NotImplementedError("Only ranking loss has been implemented for rankgen model") - outputs = torch.hstack((positive_outputs, negative_outputs)) #logits + outputs = torch.hstack((positive_outputs, negative_outputs)) # logits else: outputs = model(**inputs) logits = outputs.get("logits").view(-1, 2) @@ -133,7 +132,7 @@ class RankTrainer(Trainer): loss = self.loss_fct(positive_outputs, negative_outputs) else: raise NotImplementedError("Only ranking loss has been implemented for rankgen model") - outputs = torch.hstack((positive_outputs, negative_outputs)) # logits + outputs = torch.hstack((positive_outputs, negative_outputs)) # logits return (loss, outputs, None) else: # compute loss on predict data @@ -161,7 +160,7 @@ if __name__ == "__main__": model_parameters = filter(lambda p: 
p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print("Number of trainable : {}M".format(int(params / 1e6))) - + args = CustomTrainingArguments( output_dir=f"{model_name}-finetuned", num_train_epochs=training_conf["num_train_epochs"], @@ -196,20 +195,16 @@ if __name__ == "__main__": assert len(sum_eval) > 0 evals["hfsummary"] = sum_eval train = ConcatDataset(train_datasets) - + if "tokenizer_name" in training_conf: - tokenizer=get_tokenizer(training_conf["tokenizer_name"]) + tokenizer = get_tokenizer(training_conf["tokenizer_name"]) else: tokenizer = get_tokenizer(model_name) - + if "rankgen" in model_name: - collate_fn = RankGenCollator( - tokenizer, max_length=training_conf["max_length"] - ) + collate_fn = RankGenCollator(tokenizer, max_length=training_conf["max_length"]) else: - collate_fn = DataCollatorForPairRank( - tokenizer, max_length=training_conf["max_length"] - ) + collate_fn = DataCollatorForPairRank(tokenizer, max_length=training_conf["max_length"]) assert len(evals) > 0 trainer = RankTrainer( model=model, diff --git a/model/reward/instructor/utils.py b/model/reward/instructor/utils.py index 59165598..780ac9c8 100644 --- a/model/reward/instructor/utils.py +++ b/model/reward/instructor/utils.py @@ -26,7 +26,7 @@ def webgpt_return_format(row): def get_tokenizer(tokenizer_name): - if "t5" in tokenizer_name: #rankgen + if "t5" in tokenizer_name: # rankgen tokenizer = T5Tokenizer.from_pretrained(tokenizer_name, truncation_side="left") else: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) From 5ed4131720e60c38a4b0a5873f23d1c082d52d9f Mon Sep 17 00:00:00 2001 From: Jack Michaud Date: Tue, 3 Jan 2023 20:51:11 -0500 Subject: [PATCH 028/121] ci: run contract tests through separate cypress command and add into CI --- .github/workflows/test-api-contract.yaml | 10 +++++++++- scripts/frontend-development/run-contract-test.sh | 11 +++++++++++ website/cypress.config.contract.js | 9 +++++++++ .../{e2e => 
contract}/oasst_api_contract_tests.cy.ts | 1 + website/package.json | 1 + 5 files changed, 31 insertions(+), 1 deletion(-) create mode 100755 scripts/frontend-development/run-contract-test.sh create mode 100644 website/cypress.config.contract.js rename website/cypress/{e2e => contract}/oasst_api_contract_tests.cy.ts (93%) diff --git a/.github/workflows/test-api-contract.yaml b/.github/workflows/test-api-contract.yaml index 3707f4de..a541e887 100644 --- a/.github/workflows/test-api-contract.yaml +++ b/.github/workflows/test-api-contract.yaml @@ -15,6 +15,9 @@ jobs: - uses: actions/setup-python@v4 with: python-version: "3.10" + - uses: actions/setup-node@v3 + with: + node-version: 16 - run: cd oasst-shared && pip install -e . @@ -22,9 +25,14 @@ jobs: - run: cd backend && pip install -r requirements.txt + - run: cd frontend && npm install + - run: ./scripts/backend-development/start-mock-server.sh - - name: Run contract tests + - name: Run Python OasstApiClient contract tests run: ./scripts/oasst-shared-development/test.sh + - name: Run JavaScript OasstApiClient contract tests + run: ./scripts/frontend-development/run-contract-test.sh + - run: ./scripts/backend-development/stop-mock-server.sh diff --git a/scripts/frontend-development/run-contract-test.sh b/scripts/frontend-development/run-contract-test.sh new file mode 100755 index 00000000..6bedc903 --- /dev/null +++ b/scripts/frontend-development/run-contract-test.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) + +# switch to website directory +pushd "$parent_path/../../website" + +set -xe + +npm run cypress:run:contract + +popd diff --git a/website/cypress.config.contract.js b/website/cypress.config.contract.js new file mode 100644 index 00000000..f4461158 --- /dev/null +++ b/website/cypress.config.contract.js @@ -0,0 +1,9 @@ +import { defineConfig } from "cypress"; + +export default defineConfig({ + e2e: { + // No baseUrl here, because we don't need it for 
contract testing + baseUrl: null, + specPattern: "cypress/contract/*.cy.{ts,js}", + }, +}); diff --git a/website/cypress/e2e/oasst_api_contract_tests.cy.ts b/website/cypress/contract/oasst_api_contract_tests.cy.ts similarity index 93% rename from website/cypress/e2e/oasst_api_contract_tests.cy.ts rename to website/cypress/contract/oasst_api_contract_tests.cy.ts index 94545358..2570acec 100644 --- a/website/cypress/e2e/oasst_api_contract_tests.cy.ts +++ b/website/cypress/contract/oasst_api_contract_tests.cy.ts @@ -1,6 +1,7 @@ import OasstApiClient from "src/lib/oasst_api_client"; describe("Contract test for Oasst API", function () { + // Assumes this is running the mock server. const oasstApiClient = new OasstApiClient("http://localhost:8080", "test"); it("can fetch a task", async () => { diff --git a/website/package.json b/website/package.json index c1d0c3d2..e5240727 100644 --- a/website/package.json +++ b/website/package.json @@ -12,6 +12,7 @@ "build-storybook": "build-storybook", "cypress": "cypress open", "cypress:run": "cypress run", + "cypress:run:contract": "cypress run --config-file ./cypress.config.contract.js", "cypress:image-baseline": "cypress-image-diff -u", "fix:lint": "eslint --fix src/ --ext .js,.jsx,.ts,.tsx", "fix:format": "prettier --write ./src", From 7b5f702a0013dd140953f24189f69a6d84ef9593 Mon Sep 17 00:00:00 2001 From: Jack Michaud Date: Tue, 3 Jan 2023 20:54:49 -0500 Subject: [PATCH 029/121] docs: add TODOs --- website/cypress/contract/oasst_api_contract_tests.cy.ts | 4 ++++ website/src/pages/api/update_task.ts | 1 + 2 files changed, 5 insertions(+) diff --git a/website/cypress/contract/oasst_api_contract_tests.cy.ts b/website/cypress/contract/oasst_api_contract_tests.cy.ts index 2570acec..0f9ddd00 100644 --- a/website/cypress/contract/oasst_api_contract_tests.cy.ts +++ b/website/cypress/contract/oasst_api_contract_tests.cy.ts @@ -22,4 +22,8 @@ describe("Contract test for Oasst API", function () { }); expect(await 
oasstApiClient.ackTask(task.id, "321")).to.be.null; }); + + // TODO Add test for 204 + // TODO Add test for parsing >=300, throwing an OasstError + // TODO Add test for parsing >=300, throwing a generic error }); diff --git a/website/src/pages/api/update_task.ts b/website/src/pages/api/update_task.ts index 9582040b..5e887175 100644 --- a/website/src/pages/api/update_task.ts +++ b/website/src/pages/api/update_task.ts @@ -36,6 +36,7 @@ const handler = async (req, res) => { // Send the interaction to the Task Backend. This automatically fetches the // next task in the sequence (or the done task). + // TODO Move this into OasstApiClient. const interactionRes = await fetch(`${process.env.FASTAPI_URL}/api/v1/tasks/interaction`, { method: "POST", headers: { From a95e71d6f9b8c46d8f6d875cb1cce12bfd549e63 Mon Sep 17 00:00:00 2001 From: Jack Michaud Date: Tue, 3 Jan 2023 20:59:37 -0500 Subject: [PATCH 030/121] ci: cd into correct directory to install deps --- .github/workflows/test-api-contract.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-api-contract.yaml b/.github/workflows/test-api-contract.yaml index a541e887..4ca36da0 100644 --- a/.github/workflows/test-api-contract.yaml +++ b/.github/workflows/test-api-contract.yaml @@ -25,7 +25,7 @@ jobs: - run: cd backend && pip install -r requirements.txt - - run: cd frontend && npm install + - run: cd website && npm install - run: ./scripts/backend-development/start-mock-server.sh From 65524a2140f2c7b1f2b2ecc4b7beeb29df76d639 Mon Sep 17 00:00:00 2001 From: b-mc2 <78936105+b-mc2@users.noreply.github.com> Date: Tue, 3 Jan 2023 21:36:05 -0500 Subject: [PATCH 031/121] StackExchange Builder update Included markdown file and the initial notebook for ingesting and processing StackExchange data dumps --- .../StackExchangeBuilder.ipynb | 1833 +++++++++++++++++ .../StackExchangeBuilder.md | 98 + 2 files changed, 1931 insertions(+) create mode 100644 
notebooks/data-argumentation/StackExchangeBuilder.ipynb create mode 100644 notebooks/data-argumentation/StackExchangeBuilder.md diff --git a/notebooks/data-argumentation/StackExchangeBuilder.ipynb b/notebooks/data-argumentation/StackExchangeBuilder.ipynb new file mode 100644 index 00000000..ac215cbe --- /dev/null +++ b/notebooks/data-argumentation/StackExchangeBuilder.ipynb @@ -0,0 +1,1833 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Ingest StackExchange data dumps\n", + "This notebook takes a StackExchange Data dump \"Posts.xml\" file and ingests it into a Pandas Dataframe. Outputs of the file can be JSON, JSONL, Parquet, or CSV. " + ], + "metadata": { + "id": "TB7CEfs8F-8u" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0rHryQttyzyY" + }, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup as bs\n", + "import pandas as pd\n", + "import requests\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Extract StackExchange\n", + "Pull StackExchange file dumps. 
Specific column types are enforced to prevent errors on processing later in the notebook" + ], + "metadata": { + "id": "15mAL7GnzBv0" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "base_url = \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml\"\n", + "\n", + "def get_all_filenames():\n", + " response = requests.get(\"https://archive.org/download/stackexchange\")\n", + " if response.ok:\n", + " soup = bs(response.content, 'html.parser')\n", + " table = soup.find('table')\n", + " link_tags = table.find_all('a')\n", + " urls = {}\n", + " for link in link_tags:\n", + " url = link['href']\n", + " name = url.split('.stackexchange')[0].replace('.', '_').replace('-', '_')\n", + " if url.endswith('7z'):\n", + " urls[name] = base_url.format(url)\n", + " return urls\n", + "\n", + "urls = get_all_filenames()\n", + "\n", + "print(urls.keys())\n", + "print(urls.get('ai'))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FtcvUEaHVxcW", + "outputId": "5b0cb19d-e3d9-422b-9077-52241bd09e0e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "dict_keys(['3dprinting_meta', '3dprinting', 'Stackoverflow_com_Posts_7z', 'academia_meta', 'academia', 'ai_meta', 'ai', 'android_meta', 'android', 'anime_meta', 'anime', 'apple_meta', 'apple', 'arduino_meta', 'arduino', 'askubuntu_com_7z', 'astronomy_meta', 'astronomy', 'aviation_meta', 'aviation', 'avp_meta', 'avp', 'beer_meta', 'beer', 'bicycles_meta', 'bicycles', 'bioacoustics_meta', 'bioacoustics', 'bioinformatics_meta', 'bioinformatics', 'biology_meta', 'biology', 'bitcoin_meta', 'bitcoin', 'blender_meta', 'blender', 'boardgames_meta', 'boardgames', 'bricks_meta', 'bricks', 'buddhism_meta', 'buddhism', 'cardano_meta', 'cardano', 'chemistry_meta', 'chemistry', 'chess_meta', 'chess', 'chinese_meta', 'chinese', 'christianity_meta', 'christianity', 'civicrm_meta', 'civicrm', 
'codegolf_meta', 'codegolf', 'codereview_meta', 'codereview', 'coffee_meta', 'coffee', 'cogsci_meta', 'cogsci', 'computergraphics_meta', 'computergraphics', 'conlang_meta', 'conlang', 'cooking_meta', 'cooking', 'craftcms_meta', 'craftcms', 'crafts_meta', 'crafts', 'crypto_meta', 'crypto', 'cs_meta', 'cs', 'cseducators_meta', 'cseducators', 'cstheory_meta', 'cstheory', 'datascience_meta', 'datascience', 'dba_meta', 'dba', 'devops_meta', 'devops', 'diy_meta', 'diy', 'drones_meta', 'drones', 'drupal_meta', 'drupal', 'dsp_meta', 'dsp', 'earthscience_meta', 'earthscience', 'ebooks_meta', 'ebooks', 'economics_meta', 'economics', 'electronics_meta', 'electronics', 'elementaryos_meta', 'elementaryos', 'ell_meta', 'ell', 'emacs_meta', 'emacs', 'engineering_meta', 'engineering', 'english_meta', 'english', 'eosio_meta', 'eosio', 'es_meta_stackoverflow_com_7z', 'es_stackoverflow_com_7z', 'esperanto_meta', 'esperanto', 'ethereum_meta', 'ethereum', 'expatriates_meta', 'expatriates', 'expressionengine_meta', 'expressionengine', 'fitness_meta', 'fitness', 'freelancing_meta', 'freelancing', 'french_meta', 'french', 'gamedev_meta', 'gamedev', 'gaming_meta', 'gaming', 'gardening_meta', 'gardening', 'genealogy_meta', 'genealogy', 'german_meta', 'german', 'gis_meta', 'gis', 'graphicdesign_meta', 'graphicdesign', 'ham_meta', 'ham', 'hardwarerecs_meta', 'hardwarerecs', 'health_meta', 'health', 'hermeneutics_meta', 'hermeneutics', 'hinduism_meta', 'hinduism', 'history_meta', 'history', 'homebrew_meta', 'homebrew', 'hsm_meta', 'hsm', 'interpersonal_meta', 'interpersonal', 'iot_meta', 'iot', 'iota_meta', 'iota', 'islam_meta', 'islam', 'italian_meta', 'italian', 'ja_meta_stackoverflow_com_7z', 'ja_stackoverflow_com_7z', 'japanese_meta', 'japanese', 'joomla_meta', 'joomla', 'judaism_meta', 'judaism', 'korean_meta', 'korean', 'languagelearning_meta', 'languagelearning', 'latin_meta', 'latin', 'law_meta', 'law', 'lifehacks_meta', 'lifehacks', 'linguistics_meta', 'linguistics', 
'literature_meta', 'literature', 'magento_meta', 'magento', 'martialarts_meta', 'martialarts', 'materials_meta', 'materials', 'math_meta', 'math', 'matheducators_meta', 'matheducators', 'mathematica_meta', 'mathematica', 'mathoverflow_net_7z', 'mechanics_meta', 'mechanics', 'meta_askubuntu_com_7z', 'meta_mathoverflow_net_7z', 'meta_serverfault_com_7z', 'meta', 'meta_stackoverflow_com_7z', 'meta_superuser_com_7z', 'moderators_meta', 'moderators', 'monero_meta', 'monero', 'money_meta', 'money', 'movies_meta', 'movies', 'music_meta', 'music', 'musicfans_meta', 'musicfans', 'mythology_meta', 'mythology', 'networkengineering_meta', 'networkengineering', 'opendata_meta', 'opendata', 'opensource_meta', 'opensource', 'or_meta', 'or', 'outdoors_meta', 'outdoors', 'parenting_meta', 'parenting', 'patents_meta', 'patents', 'pets_meta', 'pets', 'philosophy_meta', 'philosophy', 'photo_meta', 'photo', 'physics_meta', 'physics', 'pm_meta', 'pm', 'poker_meta', 'poker', 'politics_meta', 'politics', 'portuguese_meta', 'portuguese', 'proofassistants_meta', 'proofassistants', 'pt_meta_stackoverflow_com_7z', 'pt_stackoverflow_com_7z', 'puzzling_meta', 'puzzling', 'quant_meta', 'quant', 'quantumcomputing_meta', 'quantumcomputing', 'raspberrypi_meta', 'raspberrypi', 'retrocomputing_meta', 'retrocomputing', 'reverseengineering_meta', 'reverseengineering', 'robotics_meta', 'robotics', 'rpg_meta', 'rpg', 'ru_meta_stackoverflow_com_7z', 'ru_stackoverflow_com_7z', 'rus_meta', 'rus', 'russian_meta', 'russian', 'salesforce_meta', 'salesforce', 'scicomp_meta', 'scicomp', 'scifi_meta', 'scifi', 'security_meta', 'security', 'serverfault_com_7z', 'sharepoint_meta', 'sharepoint', 'sitecore_meta', 'sitecore', 'skeptics_meta', 'skeptics', 'softwareengineering_meta', 'softwareengineering', 'softwarerecs_meta', 'softwarerecs', 'solana_meta', 'solana', 'sound_meta', 'sound', 'space_meta', 'space', 'spanish_meta', 'spanish', 'sports_meta', 'sports', 'sqa_meta', 'sqa', 'stackapps_com_7z', 
'stackoverflow_com_Badges_7z', 'stackoverflow_com_Comments_7z', 'stackoverflow_com_PostHistory_7z', 'stackoverflow_com_PostLinks_7z', 'stackoverflow_com_Tags_7z', 'stackoverflow_com_Users_7z', 'stackoverflow_com_Votes_7z', 'stats_meta', 'stats', 'stellar_meta', 'stellar', 'substrate_meta', 'substrate', 'superuser_com_7z', 'sustainability_meta', 'sustainability', 'tex_meta', 'tex', 'tezos_meta', 'tezos', 'tor_meta', 'tor', 'travel_meta', 'travel', 'tridion_meta', 'tridion', 'ukrainian_meta', 'ukrainian', 'unix_meta', 'unix', 'ux_meta', 'ux', 'vegetarianism_meta', 'vegetarianism', 'vi_meta', 'vi', 'webapps_meta', 'webapps', 'webmasters_meta', 'webmasters', 'windowsphone_meta', 'windowsphone', 'woodworking_meta', 'woodworking', 'wordpress_meta', 'wordpress', 'workplace_meta', 'workplace', 'worldbuilding_meta', 'worldbuilding', 'writers_meta', 'writers'])\n", + "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "xml_format_map = {'Id': int, 'PostTypeId': int, 'CreationDate': str, \n", + " 'Score': int, 'ViewCount': int, 'Body': str, 'AnswerCount': int, \n", + " 'CommentCount': int, 'ContentLicense': str, 'AcceptedAnswerId': int, \"ParentId\": int}\n", + "\n", + "\n", + "# def extract_xml_file(file_url: str):\n", + "# table = pd.read_xml(file_url)\n", + "# return table\n", + "\n", + "def xml_to_df(response: str):\n", + " \"\"\"\n", + " Collect and Manually import XML into Dataframe\n", + "\n", + " pd.read_xml() errors when XML trees are too large, this is just a hack to\n", + " download a XML file and parse into a Dataframe. 
**Not Tested on huge XML files**\n", + "\n", + " Parameters:\n", + " response (Requests.Response): Requests response object with the XML data\n", + "\n", + " Returns:\n", + " df (DataFrame): A Dataframe from the XML file\n", + " \"\"\" \n", + " soup = bs(response.content, 'xml')\n", + " posts = soup.find_all('row')\n", + "\n", + " all_posts = [post.attrs for post in posts]\n", + "\n", + " df = pd.DataFrame(all_posts)\n", + " df.AnswerCount.fillna(0, inplace=True)\n", + " df.ViewCount.fillna(0, inplace=True)\n", + " df.AcceptedAnswerId.fillna(0, inplace=True)\n", + " df.ParentId.fillna(0, inplace=True)\n", + " df['DataSource'] = response.url\n", + " df = df.astype(xml_format_map)\n", + " return df\n", + "\n", + "\n", + "\n", + "dataset_name = \"ai\"\n", + "\n", + "xml_posts_path = urls.get(dataset_name)\n", + "\n", + "\n", + "# df = extract_xml_file(test)\n", + "response = requests.get(xml_posts_path)\n", + "df = xml_to_df(response)\n", + "\n", + "\n", + "print(df.dtypes)\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 907 + }, + "id": "-t27RnxdzBYB", + "outputId": "5ec0ceed-c82b-48fa-facd-41b4aae2f9e6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Id int64\n", + "PostTypeId int64\n", + "AcceptedAnswerId int64\n", + "CreationDate object\n", + "Score int64\n", + "ViewCount int64\n", + "Body object\n", + "OwnerUserId object\n", + "LastEditorUserId object\n", + "LastEditDate object\n", + "LastActivityDate object\n", + "Title object\n", + "Tags object\n", + "AnswerCount int64\n", + "CommentCount int64\n", + "ContentLicense object\n", + "ParentId int64\n", + "ClosedDate object\n", + "FavoriteCount object\n", + "CommunityOwnedDate object\n", + "LastEditorDisplayName object\n", + "OwnerDisplayName object\n", + "DataSource object\n", + "dtype: object\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Id PostTypeId 
AcceptedAnswerId CreationDate Score \\\n", + "0 1 1 3 2016-08-02T15:39:14.947 10 \n", + "1 2 1 9 2016-08-02T15:40:20.623 14 \n", + "2 3 2 0 2016-08-02T15:40:24.820 15 \n", + "3 4 1 12 2016-08-02T15:41:22.020 33 \n", + "4 6 1 20 2016-08-02T15:43:35.460 7 \n", + "\n", + " ViewCount Body OwnerUserId \\\n", + "0 710

What does \"backprop\" mean? Is the \"backprop... 8 \n", + "1 1008

Does increasing the noise in data help to i... 8 \n", + "2 0

\"Backprop\" is the same as \"backpropagation\"... 4 \n", + "3 1266

When you're writing your algorithm, how do ... 8 \n", + "4 279

Given the following definition of an intell... 29 \n", + "\n", + " LastEditorUserId LastEditDate ... AnswerCount CommentCount \\\n", + "0 2444 2019-11-16T17:56:22.093 ... 5 0 \n", + "1 2444 2019-02-23T22:36:19.090 ... 3 0 \n", + "2 NaN NaN ... 0 0 \n", + "3 2444 2021-01-19T23:54:07.813 ... 4 0 \n", + "4 2444 2019-06-15T18:25:58.513 ... 2 0 \n", + "\n", + " ContentLicense ParentId ClosedDate FavoriteCount CommunityOwnedDate \\\n", + "0 CC BY-SA 4.0 0 NaN NaN NaN \n", + "1 CC BY-SA 4.0 0 NaN NaN NaN \n", + "2 CC BY-SA 3.0 1 NaN NaN NaN \n", + "3 CC BY-SA 3.0 0 NaN NaN NaN \n", + "4 CC BY-SA 4.0 0 NaN NaN NaN \n", + "\n", + " LastEditorDisplayName OwnerDisplayName \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " DataSource \n", + "0 https://ia600107.us.archive.org/view_archive.p... \n", + "1 https://ia600107.us.archive.org/view_archive.p... \n", + "2 https://ia600107.us.archive.org/view_archive.p... \n", + "3 https://ia600107.us.archive.org/view_archive.p... \n", + "4 https://ia600107.us.archive.org/view_archive.p... \n", + "\n", + "[5 rows x 23 columns]" + ], + "text/html": [ + "\n", + "

\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreationDateScoreViewCountBodyOwnerUserIdLastEditorUserIdLastEditDate...AnswerCountCommentCountContentLicenseParentIdClosedDateFavoriteCountCommunityOwnedDateLastEditorDisplayNameOwnerDisplayNameDataSource
01132016-08-02T15:39:14.94710710<p>What does \"backprop\" mean? Is the \"backprop...824442019-11-16T17:56:22.093...50CC BY-SA 4.00NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
12192016-08-02T15:40:20.623141008<p>Does increasing the noise in data help to i...824442019-02-23T22:36:19.090...30CC BY-SA 4.00NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
23202016-08-02T15:40:24.820150<p>\"Backprop\" is the same as \"backpropagation\"...4NaNNaN...00CC BY-SA 3.01NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
341122016-08-02T15:41:22.020331266<p>When you're writing your algorithm, how do ...824442021-01-19T23:54:07.813...40CC BY-SA 3.00NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
461202016-08-02T15:43:35.4607279<p>Given the following definition of an intell...2924442019-06-15T18:25:58.513...20CC BY-SA 4.00NaNNaNNaNNaNNaNhttps://ia600107.us.archive.org/view_archive.p...
\n", + "

5 rows × 23 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 219 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Transformations" + ], + "metadata": { + "id": "RAzTR7zY3oan" + } + }, + { + "cell_type": "code", + "source": [ + "def filter_only_questions_with_accepted_answers(df):\n", + " \"\"\" **TODO**\n", + " Filter only to Questions with Accepted Answers\n", + "\n", + " Filter dataframe by questions that have accepted answers, should also include \n", + " all rows of answers for those questions, even if not accepted.\n", + "\n", + " Parameters:\n", + " df (DataFrame): containing a \"AcceptedAnswerId\", \"Id\", and \"ParentId\" columns \n", + "\n", + " Returns:\n", + " df (DataFrame): current dataframe with filtered results\n", + " \"\"\" \n", + " df = df[(df['AcceptedAnswerId'].notnull()) | (df['ParentId'] == df['Id'])]\n", + "\n", + "\n", + "def filter_scores_above(df, question_score_threshold: int = 20, answer_score_threshold: int = 20):\n", + " \"\"\" **TODO**\n", + " Filter Dataframe by minimum scores\n", + "\n", + " Filter Question and Answer columns by score thresholds to trim lower scoring results\n", + "\n", + " Parameters:\n", + " df (DataFrame): containing a \"Score\" column\n", + "\n", + " Returns:\n", + " df (DataFrame): current dataframe with filtered results \n", + " \"\"\" \n", + " df = df[\n", + " ((df['Score'] >= question_score_threshold) & (df.PostTypeId == 1)) | \n", + " ((df['Score'] >= answer_score_threshold) & (df.PostTypeId == 2))\n", + " ]\n", + "\n", + "\n", + "def convert_html_to_text(df, column: str = \"Body\"):\n", + " \"\"\"\n", + " Convert HTML tags to pure text\n", + "\n", + " Feeds HTML text body into BeautifulSoup to parse it to only text. 
Set aside as \n", + " function to provide option to skip\n", + "\n", + " Parameters:\n", + " df (DataFrame): containing a \"Body\" column with HTML \n", + "\n", + " Returns:\n", + " df (DataFrame): current dataframe with parsed column \n", + " \"\"\" \n", + " df.dropna(subset=[column], inplace=True)\n", + " df[f\"{column}Clean\"] = df[column].apply(lambda row: bs(row, 'html.parser').text)\n", + "\n", + "\n", + "def clean_tags(df):\n", + " \"\"\"\n", + " Convert Tags into Comma separated\n", + "\n", + " Converts Tag slugs into commas separated tags\n", + "\n", + " Parameters:\n", + " df (DataFrame): containing a \"Tags\" column with slugs\n", + "\n", + " Returns:\n", + " df (DataFrame): current dataframe with parsed column\n", + " \"\"\" \n", + " df['TagsClean'] = (df['Tags']\n", + " .str.replace('-',' ')\n", + " .str.replace('><', ', ')\n", + " .str.replace('<','')\n", + " .str.replace('>','')\n", + " )\n", + "\n", + "\n", + "# filter_only_questions_with_accepted_answers(df)\n", + "# filter_scores_above(df)\n", + "convert_html_to_text(df)\n", + "clean_tags(df)\n", + "\n", + "df[['Body', 'BodyClean', 'Tags', 'TagsClean']]\n", + "# print(df.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "qyUqc31Z3Z9g", + "outputId": "18dce8b4-af26-49c9-ee73-6c677177b516" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Body \\\n", + "0

What does \"backprop\" mean? Is the \"backprop... \n", + "1

Does increasing the noise in data help to i... \n", + "2

\"Backprop\" is the same as \"backpropagation\"... \n", + "3

When you're writing your algorithm, how do ... \n", + "4

Given the following definition of an intell... \n", + "... ... \n", + "23174

The purpose of evaluating the state and act... \n", + "23175

In machine translation, convolution is a te... \n", + "23176

One of the key features of ChatGPT is its a... \n", + "23177

Given a neural network model for Covid-19 c... \n", + "23178

My question is more related to the fundamen... \n", + "\n", + " BodyClean \\\n", + "0 What does \"backprop\" mean? Is the \"backprop\" t... \n", + "1 Does increasing the noise in data help to impr... \n", + "2 \"Backprop\" is the same as \"backpropagation\": i... \n", + "3 When you're writing your algorithm, how do you... \n", + "4 Given the following definition of an intellige... \n", + "... ... \n", + "23174 The purpose of evaluating the state and action... \n", + "23175 In machine translation, convolution is a techn... \n", + "23176 One of the key features of ChatGPT is its abil... \n", + "23177 Given a neural network model for Covid-19 clas... \n", + "23178 My question is more related to the fundamental... \n", + "\n", + " Tags \\\n", + "0 ... \n", + "4 \n", + "... ... \n", + "23174 NaN \n", + "23175 NaN \n", + "23176 NaN \n", + "23177 \n", + "23178 \n", + "\n", + " TagsClean \n", + "0 neural networks, backpropagation, terminology,... \n", + "1 neural networks, machine learning, statistical... \n", + "2 NaN \n", + "3 neural networks, hyperparameter optimization, ... \n", + "4 philosophy, definitions, intelligent agent \n", + "... ... \n", + "23174 NaN \n", + "23175 NaN \n", + "23176 NaN \n", + "23177 neural networks, homework \n", + "23178 search, constraint satisfaction problems \n", + "\n", + "[23179 rows x 4 columns]" + ], + "text/html": [ + "\n", + "

\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BodyBodyCleanTagsTagsClean
0<p>What does \"backprop\" mean? Is the \"backprop...What does \"backprop\" mean? Is the \"backprop\" t...<neural-networks><backpropagation><terminology...neural networks, backpropagation, terminology,...
1<p>Does increasing the noise in data help to i...Does increasing the noise in data help to impr...<neural-networks><machine-learning><statistica...neural networks, machine learning, statistical...
2<p>\"Backprop\" is the same as \"backpropagation\"...\"Backprop\" is the same as \"backpropagation\": i...NaNNaN
3<p>When you're writing your algorithm, how do ...When you're writing your algorithm, how do you...<neural-networks><hyperparameter-optimization>...neural networks, hyperparameter optimization, ...
4<p>Given the following definition of an intell...Given the following definition of an intellige...<philosophy><definitions><intelligent-agent>philosophy, definitions, intelligent agent
...............
23174<p>The purpose of evaluating the state and act...The purpose of evaluating the state and action...NaNNaN
23175<p>In machine translation, convolution is a te...In machine translation, convolution is a techn...NaNNaN
23176<p>One of the key features of ChatGPT is its a...One of the key features of ChatGPT is its abil...NaNNaN
23177<p>Given a neural network model for Covid-19 c...Given a neural network model for Covid-19 clas...<neural-networks><homework>neural networks, homework
23178<p>My question is more related to the fundamen...My question is more related to the fundamental...<search><constraint-satisfaction-problems>search, constraint satisfaction problems
\n", + "

23179 rows × 4 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 220 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "This groups questions with answers so that a row with a question also has a column with an answer. It then creates an AcceptedAnswerFlag column that is True if the answer was accepted by the person who asked the question. Changing the `number_of_results` variable will limit the number of answers you want to keep." + ], + "metadata": { + "id": "C09Bwdw-44PZ" + } + }, + { + "cell_type": "code", + "source": [ + "questions = df[df.PostTypeId == 1]\n", + "answers = df[df.PostTypeId == 2]\n", + "\n", + "df = pd.merge(\n", + "\tquestions,\n", + "\tanswers[\n", + " ['Id', 'CreationDate', 'Score', 'ViewCount', 'CommentCount', \n", + " 'ContentLicense', 'TagsClean', 'BodyClean', 'ParentId']\n", + "\t],\n", + "\tleft_on='Id',\n", + "\tright_on='ParentId',\n", + "\tsuffixes=('_q', '_a'),\n", + "\thow='left'\n", + ")\n", + "\n", + "df['AcceptedAnswerFlag'] = df.apply(lambda row: row['Id_a'] == row['AcceptedAnswerId'], axis=1)\n", + "\n", + "df = df.rename(columns={\n", + "\t'BodyClean_q': 'Question',\n", + "\t'Score_q': 'QuestionScore',\n", + "\t'TagsClean_q': 'QuestionTags',\n", + "\t'BodyClean_a': 'Answer',\n", + "\t'Score_a': \"AnswerScore\",\n", + "\t'ContentLicense_q':'QuestionContentLicense',\n", + "\t'ContentLicense_a':'AnswerContentLicense',\n", + "\t'CreationDate_q': 'CreationDate'\n", + "})\n", + "\n", + "## Set the number of results to a lower number to only return top N rated Answers.\n", + "number_of_results = 25\n", + "df = (\n", + " df.sort_values(by=['AcceptedAnswerFlag', 'AnswerScore'], ascending=[False, False])\n", + " .groupby('Question')\n", + " .head(number_of_results)\n", + " .reset_index(drop=True)\n", + " )\n", + "\n", + "df[['Id_q', 'Question', 'QuestionScore', 'QuestionTags', 'Id_a', 'Answer', 'AnswerScore', 'AcceptedAnswerFlag']].head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 337 + 
}, + "id": "Bgz2fZ9k43Ab", + "outputId": "28896d69-03cd-4877-fdfb-ae48dafa4ff3" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Id_q Question QuestionScore \\\n", + "0 1768 In Portal 2 we see that AI's can be \"killed\" b... 175 \n", + "1 10623 What is self-supervised learning in machine le... 91 \n", + "2 111 Obviously, self-driving cars aren't perfect, s... 100 \n", + "3 14224 If the original purpose for developing AI was ... 69 \n", + "4 1479 Do scientists or research experts know from th... 94 \n", + "\n", + " QuestionTags Id_a \\\n", + "0 philosophy, decision theory, mythology of ai, ... 1769.0 \n", + "1 machine learning, comparison, supervised learn... 10624.0 \n", + "2 philosophy, ethics, autonomous vehicles, decis... 1790.0 \n", + "3 philosophy, social, explainable ai 14247.0 \n", + "4 neural networks, deep learning, convolutional ... 4044.0 \n", + "\n", + " Answer AnswerScore \\\n", + "0 This classic problem exhibits a basic misunder... 146.0 \n", + "1 Introduction\\nThe term self-supervised learnin... 90.0 \n", + "2 \\nHow could self-driving cars make ethical dec... 76.0 \n", + "3 As argued by Selvaraju et al., there are three... 75.0 \n", + "4 There are many approaches that aim to make a t... 69.0 \n", + "\n", + " AcceptedAnswerFlag \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Id_qQuestionQuestionScoreQuestionTagsId_aAnswerAnswerScoreAcceptedAnswerFlag
01768In Portal 2 we see that AI's can be \"killed\" b...175philosophy, decision theory, mythology of ai, ...1769.0This classic problem exhibits a basic misunder...146.0True
110623What is self-supervised learning in machine le...91machine learning, comparison, supervised learn...10624.0Introduction\\nThe term self-supervised learnin...90.0True
2111Obviously, self-driving cars aren't perfect, s...100philosophy, ethics, autonomous vehicles, decis...1790.0\\nHow could self-driving cars make ethical dec...76.0True
314224If the original purpose for developing AI was ...69philosophy, social, explainable ai14247.0As argued by Selvaraju et al., there are three...75.0True
41479Do scientists or research experts know from th...94neural networks, deep learning, convolutional ...4044.0There are many approaches that aim to make a t...69.0True
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 221 + } + ] + }, + { + "cell_type": "code", + "source": [ + "testing_id = df.Id_q.mode()[0]\n", + "df[(df.Id_q == testing_id) | (df.ParentId_a == testing_id)][['Id_q', 'Question', 'ParentId_a', 'AcceptedAnswerId', 'Id_a', 'Answer', 'AnswerScore', 'AcceptedAnswerFlag']]\n", + "# df[['Id_q', 'Question', 'ParentId_a', 'AcceptedAnswerId', 'Id_a', 'Answer', 'AnswerScore', 'AcceptedAnswerFlag']]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 645 + }, + "id": "eds1K8WL9QPo", + "outputId": "bc526503-d6dd-428f-fa98-ad419d26a7dc" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Id_q Question ParentId_a \\\n", + "7 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3662 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3713 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3788 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3821 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "3882 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "4389 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "4849 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "4850 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "5763 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "5764 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "5765 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7462 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7463 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7464 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7465 15730 As a human being, we can think infinity. In pr... 
15730.0 \n", + "7466 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "7467 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "9481 15730 As a human being, we can think infinity. In pr... 15730.0 \n", + "\n", + " AcceptedAnswerId Id_a \\\n", + "7 15744 15744.0 \n", + "3662 15744 15753.0 \n", + "3713 15744 15747.0 \n", + "3788 15744 15756.0 \n", + "3821 15744 15758.0 \n", + "3882 15744 15762.0 \n", + "4389 15744 15783.0 \n", + "4849 15744 15740.0 \n", + "4850 15744 15803.0 \n", + "5763 15744 15768.0 \n", + "5764 15744 15810.0 \n", + "5765 15744 15943.0 \n", + "7462 15744 15779.0 \n", + "7463 15744 15787.0 \n", + "7464 15744 15801.0 \n", + "7465 15744 15930.0 \n", + "7466 15744 15934.0 \n", + "7467 15744 15938.0 \n", + "9481 15744 15931.0 \n", + "\n", + " Answer AnswerScore \\\n", + "7 I think this is a fairly common misconception ... 62.0 \n", + "3662 I think your premise is flawed.\\nYou seem to a... 19.0 \n", + "3713 TL;DR: The subtleties of infinity are made app... 12.0 \n", + "3788 In Haskell, you can type:\\nprint [1..]\\nand it... 9.0 \n", + "3821 I believe humans can be said to understand inf... 8.0 \n", + "3882 (There's a summary at the bottom for those who... 7.0 \n", + "4389 Then premise assumes that humans \"understand\" ... 4.0 \n", + "4849 By adding some rules for infinity in arithmeti... 3.0 \n", + "4850 I think the concept that is missing in the dis... 3.0 \n", + "5763 Computers don't understand \"infinity\" or even ... 2.0 \n", + "5764 The Questions That Computers Can Never Answer ... 2.0 \n", + "5765 John Doucette's answer covers my thoughts on t... 2.0 \n", + "7462 I would think that a computer couldn’t underst... 1.0 \n", + "7463 The \"concept\" of infinity is 1 thing to unders... 1.0 \n", + "7464 Just food for thought: how about if we try to ... 1.0 \n", + "7465 Its arguable if we humans understand infinity.... 1.0 \n", + "7466 Well -- just to touch on the question of peopl... 
1.0 \n", + "7467 Humans certainly don't understand infinity. Cu... 1.0 \n", + "9481 I think the property humans have which compute... 0.0 \n", + "\n", + " AcceptedAnswerFlag \n", + "7 True \n", + "3662 False \n", + "3713 False \n", + "3788 False \n", + "3821 False \n", + "3882 False \n", + "4389 False \n", + "4849 False \n", + "4850 False \n", + "5763 False \n", + "5764 False \n", + "5765 False \n", + "7462 False \n", + "7463 False \n", + "7464 False \n", + "7465 False \n", + "7466 False \n", + "7467 False \n", + "9481 False " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + "
Id_qQuestionParentId_aAcceptedAnswerIdId_aAnswerAnswerScoreAcceptedAnswerFlag
715730As a human being, we can think infinity. In pr...15730.01574415744.0I think this is a fairly common misconception ...62.0True
366215730As a human being, we can think infinity. In pr...15730.01574415753.0I think your premise is flawed.\\nYou seem to a...19.0False
371315730As a human being, we can think infinity. In pr...15730.01574415747.0TL;DR: The subtleties of infinity are made app...12.0False
378815730As a human being, we can think infinity. In pr...15730.01574415756.0In Haskell, you can type:\\nprint [1..]\\nand it...9.0False
382115730As a human being, we can think infinity. In pr...15730.01574415758.0I believe humans can be said to understand inf...8.0False
388215730As a human being, we can think infinity. In pr...15730.01574415762.0(There's a summary at the bottom for those who...7.0False
438915730As a human being, we can think infinity. In pr...15730.01574415783.0Then premise assumes that humans \"understand\" ...4.0False
484915730As a human being, we can think infinity. In pr...15730.01574415740.0By adding some rules for infinity in arithmeti...3.0False
485015730As a human being, we can think infinity. In pr...15730.01574415803.0I think the concept that is missing in the dis...3.0False
576315730As a human being, we can think infinity. In pr...15730.01574415768.0Computers don't understand \"infinity\" or even ...2.0False
576415730As a human being, we can think infinity. In pr...15730.01574415810.0The Questions That Computers Can Never Answer ...2.0False
576515730As a human being, we can think infinity. In pr...15730.01574415943.0John Doucette's answer covers my thoughts on t...2.0False
746215730As a human being, we can think infinity. In pr...15730.01574415779.0I would think that a computer couldn’t underst...1.0False
746315730As a human being, we can think infinity. In pr...15730.01574415787.0The \"concept\" of infinity is 1 thing to unders...1.0False
746415730As a human being, we can think infinity. In pr...15730.01574415801.0Just food for thought: how about if we try to ...1.0False
746515730As a human being, we can think infinity. In pr...15730.01574415930.0Its arguable if we humans understand infinity....1.0False
746615730As a human being, we can think infinity. In pr...15730.01574415934.0Well -- just to touch on the question of peopl...1.0False
746715730As a human being, we can think infinity. In pr...15730.01574415938.0Humans certainly don't understand infinity. Cu...1.0False
948115730As a human being, we can think infinity. In pr...15730.01574415931.0I think the property humans have which compute...0.0False
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 222 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Create JSONL version of Dataframe\n", + "This groups the dataframe by question data and creates nested list of Answers for that group. The entire list contains individual JSON objects, each representing a single question in the dataset with a key, Answers, which contains a list of dictionaries for each answer to the question." + ], + "metadata": { + "id": "gXgpXEO7DCbj" + } + }, + { + "cell_type": "code", + "source": [ + "j = (\n", + "\tdf.groupby(['Title', 'Question', 'QuestionScore', 'QuestionTags', 'QuestionContentLicense', 'DataSource', 'CreationDate'])\n", + "\t.apply(lambda x: x[['Answer', 'AnswerScore', 'AcceptedAnswerFlag']].to_dict('records'))\n", + "\t.reset_index()\n", + "\t.rename(columns={0:'Answers'})\n", + "\t.to_json(orient='records')\n", + ")\n", + "\n", + "data = json.loads(j)\n", + "\n", + "for post in data:\n", + " if len(post.get('Answers')) >= 4:\n", + " print(\n", + " json.dumps(post, indent=4)\n", + " )\n", + " break" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OBR58MSRzAMP", + "outputId": "c7da1e6c-3a97-465d-c9ba-7e055cb0d751" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{\n", + " \"Title\": \"1 hidden layer with 1000 neurons vs. 10 hidden layers with 100 neurons\",\n", + " \"Question\": \"These types of questions may be problem-dependent, but I have tried to find research that addresses the question whether the number of hidden layers and their size (number of neurons in each layer) really matter or not.\\nSo my question is, does it really matter if we for example have 1 large hidden layer of 1000 neurons vs. 
10 hidden layers with 100 neurons each?\\n\",\n", + " \"QuestionScore\": 16,\n", + " \"QuestionTags\": \"neural networks\",\n", + " \"QuestionContentLicense\": \"CC BY-SA 3.0\",\n", + " \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n", + " \"CreationDate\": \"2017-05-04T13:06:37.990\",\n", + " \"Answers\": [\n", + " {\n", + " \"Answer\": \"Basically, having multiple layers (aka a deep network) makes your network more eager to recognize certain aspects of input data. For example, if you have the details of a house (size, lawn size, location etc.) as input and want to predict the price. The first layer may predict:\\n\\nBig area, higher price\\nSmall amount of bedrooms, lower price\\n\\nThe second layer might conclude:\\n\\nBig area + small amount of bedrooms = large bedrooms = +- effect\\n\\nYes, one layer can also 'detect' the stats, however it will require more neurons as it cannot rely on other neurons to do 'parts' of the total calculation required to detect that stat.\\nCheck out this answer\\n\",\n", + " \"AnswerScore\": 13.0,\n", + " \"AcceptedAnswerFlag\": true\n", + " },\n", + " {\n", + " \"Answer\": \"There are so many aspects.\\n1. Training:\\nTraining deep nets is a hard job due to the vanishing (rearly exploding) gradient problem. So building a 10x100 neural-net is not recommended.\\n2. Trained network performance:\\n\\nInformation loss:\\nThe classical usage of neural nets is the classification problem. Which means we want to get some well defined information from the data. (Ex. Is there a face in the picture or not.)\\nSo usually classification problem has a lot of input, and few output, whats more the size of the hidden layers are descend from input to output.\\nHowever, we loss information using less neurons layer by layer. (Ie. We cannot reproduce the original image based on the fact that is there a face on it or no.) 
So you must know that you loss information using 100 neurons if the size of the input is (lets say) 1000.\\nInformation complexity: However the deeper nets (as Tomas W mentioned) can fetch more complex information from the input data. Inspite of this its not recommended to use 10 fully connected layers. Its recommended to use convolutional/relu/maxpooling or other type of layers. Firest layers can compress the some essential part of the inputs. (Ex is there any line in a specific part of the picture) Second layers can say: There is a specific shape in this place in the picture. Etc etc.\\n\\nSo deeper nets are more \\\"clever\\\" but 10x100 net structure is a good choice.\\n\",\n", + " \"AnswerScore\": 4.0,\n", + " \"AcceptedAnswerFlag\": false\n", + " },\n", + " {\n", + " \"Answer\": \"If the problem you are solving is linearly separable, one layer of 1000 neurons can do better job than 10 layers with each of 100 neurons.\\nIf the problem is non linear and not convex, then you need deep neural nets. \\n\",\n", + " \"AnswerScore\": 1.0,\n", + " \"AcceptedAnswerFlag\": false\n", + " },\n", + " {\n", + " \"Answer\": \"\\nI think you have a confusion in the basics of the neural networks.\\n Every layer has a separate activation function and input/output\\n connection weights.\\n\\nThe output of the first hidden layer will be multiplied by a weight, processed by an activation function in the next layer and so on.\\nSingle layer neural networks are very limited for simple tasks, deeper NN can perform far better than a single layer. \\nHowever, do not use more than layer if your application is not fairly complex. In conclusion, 100 neurons layer does not mean better neural network than 10 layers x 10 neurons but 10 layers are something imaginary unless you are doing deep learning. start with 10 neurons in the hidden layer and try to add layers or add more neurons to the same layer to see the difference. 
learning with more layers will be easier but more training time is required.\\n\",\n", + " \"AnswerScore\": 0.0,\n", + " \"AcceptedAnswerFlag\": false\n", + " }\n", + " ]\n", + "}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Save file\n", + "\n", + "Files can be saved as JSON, JSONL, CSV, or Parquet" + ], + "metadata": { + "id": "PlNjrpXaDm1_" + } + }, + { + "cell_type": "code", + "source": [ + "file_name = dataset_name\n", + "\n", + "def save_data(data: list, file_name: str, file_type: str = ['csv', 'json', 'jsonl', 'parquet']):\n", + " \"\"\"\n", + " Save Data to file\n", + "\n", + " Save Data list to file as either JSON or JSONL\n", + "\n", + " Parameters:\n", + " data (list): list of dictionaries\n", + " file_name (str): name of file (no extension)\n", + " jsonl (bool): to save file as either JSON or JSONL\n", + " \"\"\" \n", + " file_type = file_type.lower()\n", + "\n", + " if file_type == \"csv\" and isinstance(data, pd.DataFrame):\n", + " data.to_csv(f\"/content/{file_name}.csv\", index=False)\n", + "\n", + " elif file_type == \"json\" and isinstance(data, list):\n", + " print(\n", + " json.dumps(data, indent=4),\n", + " file=open(f\"/content/{file_name}.json\", 'w')\n", + " )\n", + "\n", + " elif file_type == \"jsonl\" and isinstance(data, list):\n", + " for item in data:\n", + " print(json.dumps(item), file=open(f\"/content/{file_name}.jsonl\", 'a'))\n", + "\n", + " elif file_type == \"parquet\" and isinstance(data, pd.DataFrame):\n", + " data.to_parquet(f\"/content/{file_name}.parquet\", index=False)\n", + " \n", + " else:\n", + " print(\"Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\")\n", + "\n", + "\n", + "\n", + "# save_data(data=data, file_name=file_name, file_type='jsonl')\n", + "# save_data(data=df, file_name=file_name, file_type='parquet')" + ], + "metadata": { + "id": "CU0gWRGQDqIs", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": 
"9646e475-cedd-46f1-f9b8-7eb1fbc703c7" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Open-Assistant Data Scheme\n", + "\n", + "Testing putting the data into the Open-Assistant Data Scheme\n", + "\n", + "https://github.com/LAION-AI/Open-Assistant/blob/main/docs/data_schemas.md" + ], + "metadata": { + "id": "BdN3hKxtgH7f" + } + }, + { + "cell_type": "code", + "source": [ + "from typing import TypeVar, List, Dict, Any, Literal\n", + "from json import JSONEncoder\n", + "\n", + "T = TypeVar('T', bound='ConversationTreeNode')\n", + "\n", + "\n", + "class ConversationTreeNode:\n", + " text: str # The text of the node\n", + " role: Literal['prompter', 'assistant'] # Whether the node is a user prompt/follow-up or an assistant response\n", + " children: List[T] # The children of the node (if you have a linear conversation, this will be of length 0 or 1)\n", + " metadata: Dict[str, Any] # Node metadata (see below)\n", + "\n", + " def __init__(self, text: str, role: Literal['prompter', 'assistant'], children: List[T], metadata: Dict[str, Any]) -> None:\n", + " self.text = text\n", + " self.role = role\n", + " self.children = children\n", + " self.metadata = metadata\n", + "\n", + "\n", + "class ConversationTree:\n", + " root: ConversationTreeNode # The node containing the initial prompt\n", + " metadata: Dict[str, Any] # Tree metadata, different from root node metadata.\n", + "\n", + " def __init__(self, root: ConversationTreeNode, metadata: Dict[str, Any]) -> None:\n", + " self.root = root\n", + " self.metadata = metadata\n", + "\n", + "\n", + "# subclass JSONEncoder\n", + "class TreeEncoder(JSONEncoder):\n", + " def default(self, o):\n", + " return o.__dict__" + ], + "metadata": { + "id": "n8ubYQxegNSY" + }, + "execution_count": null, + 
"outputs": [] + }, + { + "cell_type": "code", + "source": [ + "conversation_forest = []\n", + "\n", + "tree_metadata_map = {'Title': str, 'QuestionContentLicense': str, 'DataSource': str, 'CreationDate': str}\n", + "question_metadata_map = {'QuestionScore': int, 'QuestionTags': str}\n", + "answer_metadata_map = {'AnswerScore': int, 'AcceptedAnswerFlag': bool}\n", + "\n", + "\n", + "for item in data:\n", + " prompt = item.get('Question')\n", + " metadata = {k: v for k, v in item.items() if k in question_metadata_map}\n", + " root = ConversationTreeNode(text=prompt, role='prompter', children=[], metadata=metadata)\n", + "\n", + " for answer in item.get('Answers'):\n", + " response = answer.get('Answer')\n", + " metadata = {k: v for k, v in answer.items() if k in answer_metadata_map}\n", + " child = ConversationTreeNode(text=response, role='assistant', children=[], metadata=metadata)\n", + " root.children.append(child)\n", + " \n", + " metadata = {k: v for k, v in item.items() if k in tree_metadata_map}\n", + " conversation_tree = ConversationTree(root=root, metadata=metadata)\n", + " conversation_forest.append(conversation_tree)\n", + "\n", + "\n", + "conversation_forest_json = [\n", + " json.loads(TreeEncoder().encode(conversation_tree))\n", + " for conversation_tree in conversation_forest\n", + " ]\n", + "\n", + "\n", + "# print(json.dumps(conversation_forest_json[0], indent=4))\n", + "\n", + "\n", + "\n", + "print(\n", + " json.dumps(conversation_forest_json, indent=4),\n", + " file=open(f\"/content/{file_name}.json\", 'w')\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eE0fkytExSGl", + "outputId": "594632d6-f98c-49b8-af86-25f7f5e2ce06" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{\n", + " \"root\": {\n", + " \"text\": \"Science Fiction has frequently shown AI to be a threat to the very existence of mankind. 
AI systems have often been the antagonists in many works of fiction, from 2001: A Space Odyssey through to The Terminator and beyond.\\nThe Media seems to buy into this trope as well. And in recent years we have had people like Elon Musk warn us of the dangers of an impending AI revolution, stating that AI is more dangerous than nukes.\\nAnd, apparently, experts think that we will be seeing this AI revolution in the next 100 years.\\nHowever, from my (albeit limited) study of AI, I get the impression that they are all wrong. I am going to outline my understanding below, please correct me if I am wrong:\\n\\nFirstly, all of these things seem to be confusing Artificial Intelligence with Artificial Consciousness. AI is essentially a system to make intelligent decisions, whereas AC is more like the \\\"self-aware\\\" systems that are shown in science fiction.\\n\\nNot AI itself, but intelligence and intelligent decision-making algorithms are something we've been working with and enhancing since before computers have been around. Moving this over to an artificial framework is fairly easy. However, consciousness is still something we are learning about. 
My guess is we won't be able to re-create something artificially if we barely understand how it works in the real world.\\n\\nSo, my conclusion is that no AI system will be able to learn enough to start thinking for itself, and that all our warnings of AI are completely unjustified.\\n\\nThe real danger comes from AC, which we are a long, long way from realizing because we are still a long way off from defining exactly what consciousness is, let alone understanding it.\\n\\n\\n\\nSo, my question is, assuming that my understanding is correct, are any efforts are being made by companies or organizations that work with AI to correct these popular misunderstandings in sci-fi, the media, and/or the public?\\nOr are the proponents of AI ambivalent towards this public fear-mongering?\\nI understand that the fear mongering is going to remain popular for some time, as bad news sells better than good news. I am just wondering if the general attitude from AI organizations is to ignore this popular misconception, or whether a concerted effort is being made to fight against these AI myths (but unfortunately nobody in the media is listening or cares).\\n\",\n", + " \"role\": \"prompter\",\n", + " \"children\": [\n", + " {\n", + " \"text\": \"Nothing. \\nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators makes a perception that the field has greater capabilities than it does $\\\\rightarrow$ this leads to grants, funding, etc... \\nIs there any negative? Yes. Misconceptions always have drawbacks. 
We see the creation of dumb ethics boards and such cough cough Elon Musk.\\nBut if history has anything to say about this, as the field gains popularity (which it is dnagerously quick), information will spread by definition, and eventually misconceptions will be laid to rest.\\nNote that this answer is biased and based upon my own opinions\\n\",\n", + " \"role\": \"assistant\",\n", + " \"children\": [],\n", + " \"metadata\": {\n", + " \"AnswerScore\": 2.0,\n", + " \"AcceptedAnswerFlag\": true\n", + " }\n", + " }\n", + " ],\n", + " \"metadata\": {\n", + " \"QuestionScore\": 5,\n", + " \"QuestionTags\": \"social, artificial consciousness\"\n", + " }\n", + " },\n", + " \"metadata\": {\n", + " \"Title\": \"\\\"AI will kill us all! The machines will rise up!\\\" - what is being done to dispel such myths?\",\n", + " \"QuestionContentLicense\": \"CC BY-SA 4.0\",\n", + " \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n", + " \"CreationDate\": \"2019-10-16T13:57:37.143\"\n", + " }\n", + "}\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/notebooks/data-argumentation/StackExchangeBuilder.md b/notebooks/data-argumentation/StackExchangeBuilder.md new file mode 100644 index 00000000..3a09dee3 --- /dev/null +++ b/notebooks/data-argumentation/StackExchangeBuilder.md @@ -0,0 +1,98 @@ +# StackExchange Builder + +StackExchange Builder is a notebook that downloads data from StackExchange data dumps +and converts it into different formats. It parses the XML files, groups questions and answers, optionally filters the dataset, and puts the results into the Open-Assistant Data Scheme. +Files can be saved to either JSON, JSONL, Parquet, or CSV.
+ +--- +#### Sample Data Open-Assistant Data Scheme: + +Open-Assistant Data Scheme as outlined here: https://github.com/LAION-AI/Open-Assistant/blob/main/docs/data_schemas.md + +``` +{ + "root": { + "text": "Science Fiction has frequently shown AI to be a threat to the very existence of mankind. AI systems have often been the antagonists...", + "role": "prompter", + "children": [ + { + "text": "Nothing. \nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators...", + "role": "assistant", + "children": [], + "metadata": { + "AnswerScore": 2.0, + "AcceptedAnswerFlag": true + } + } + ], + "metadata": { + "QuestionScore": 5, + "QuestionTags": "social, artificial consciousness" + } + }, + "metadata": { + "Title": "\"AI will kill us all! The machines will rise up!\" - what is being done to dispel such myths?", + "QuestionContentLicense": "CC BY-SA 4.0", + "DataSource": "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml", + "CreationDate": "2019-10-16T13:57:37.143" + } +} +``` + +--- +#### JSONL format +Each question and all related answers are on a single line in JSONL format. + +``` +{ + "Title": "1 hidden layer with 1000 neurons vs. 10 hidden layers with 100 neurons", + "Question": "These types of questions may be problem-dependent...", + "QuestionScore": 16, + "QuestionTags": "neural networks", + "QuestionContentLicense": "CC BY-SA 3.0", + "DataSource": "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml", + "CreationDate": "2017-05-04T13:06:37.990", + "Answers": [ + { + "Answer": "Basically, having multiple layers (aka a deep network) makes your network more eager to recognize certain aspects of input data...", + "AnswerScore": 13.0, + "AcceptedAnswerFlag": true + }, + { + "Answer": "There are so many aspects.\n1. 
Training:\nTraining deep nets is a hard job due to the vanishing (rearly exploding) gradient problem...", + "AnswerScore": 4.0, + "AcceptedAnswerFlag": false + }, + { + "Answer": "If the problem you are solving is linearly separable, one layer of 1000 neurons can do better job...", + "AnswerScore": 1.0, + "AcceptedAnswerFlag": false + }, + { + "Answer": "\nI think you have a confusion in the basics of the neural networks.\n Every layer has a separate activation...", + "AnswerScore": 0.0, + "AcceptedAnswerFlag": false + } + ] +} +``` + + +#### Table/CSV/Parquet Format + +There are a lot more columns left over in the table format. `_q` and `_a` are suffixes indicating whether a column came from the question or the answer table; they are left over from the join. + +``` +| Id_q | Question | ParentId_a | AcceptedAnswerId | Id_a | Answer | AnswerScore | AcceptedAnswerFlag | +|------:|--------------------------------------------------:|-----------:|-----------------:|--------:|--------------------------------------------------:|------------:|-------------------:| +| 15730 | As a human being, we can think infinity. In pr... | 15730.0 | 15744 | 15744.0 | I think this is a fairly common misconception ... | 62.0 | True | +| 15730 | As a human being, we can think infinity. In pr... | 15730.0 | 15744 | 15753.0 | I think your premise is flawed.\nYou seem to a... | 19.0 | False | +| 15730 | As a human being, we can think infinity. In pr... | 15730.0 | 15744 | 15747.0 | TL;DR: The subtleties of infinity are made app... | 12.0 | False | +| 15730 | As a human being, we can think infinity. In pr... | 15730.0 | 15744 | 15756.0 | In Haskell, you can type:\nprint [1..]\nand it... | 9.0 | False | +``` + +--- + +## Contributing + +Feel free to contribute to this notebook. It's not perfect and additional functionality is planned.
From ca8d3c8f8d8b595580c54c90d4f3a83feece5775 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 4 Jan 2023 13:45:05 +1100 Subject: [PATCH 032/121] Fix style --- .pre-commit-config.yaml | 2 +- docs/datasets/README.md | 15 +- openassistant/__init__.py | 0 openassistant/templates/README.md | 31 +++ openassistant/templates/requirements.txt | 2 + openassistant/templates/template.py | 206 ++++++++++++++++++ openassistant/utils/__init__.py | 0 openassistant/utils/configs.py | 15 ++ openassistant/utils/schemas/__init__.py | 0 .../utils/schemas/language_modeling.py | 12 + 10 files changed, 275 insertions(+), 8 deletions(-) create mode 100644 openassistant/__init__.py create mode 100644 openassistant/templates/README.md create mode 100644 openassistant/templates/requirements.txt create mode 100644 openassistant/templates/template.py create mode 100644 openassistant/utils/__init__.py create mode 100644 openassistant/utils/configs.py create mode 100644 openassistant/utils/schemas/__init__.py create mode 100644 openassistant/utils/schemas/language_modeling.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index de06c7ba..ff34b62c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: "build|stubs|^bot/templates/" +exclude: "build|stubs|^bot/templates/|openassistant/templates" default_language_version: python: python3 diff --git a/docs/datasets/README.md b/docs/datasets/README.md index e7cb264e..08c5fd13 100644 --- a/docs/datasets/README.md +++ b/docs/datasets/README.md @@ -11,11 +11,13 @@ and then running: ```python from datasets import load_dataset -dataset = load_dataset("OpenAssistant/") +dataset = load_dataset("OpenAssistant/{dataset-name}") ``` See the instructions below if you'd like to contribute a new dataset to the project. 
+## Adding a new dataset + ## Uploading a dataset to the Hugging Face Hub Adding a new dataset for the OpenAssistant project typically involves the following steps: @@ -29,8 +31,8 @@ Adding a new dataset for the OpenAssistant project typically involves the follow To upload a dataset to the OpenAssistant organization, you first need to: -* Create a [Hugging Face account](https://huggingface.co/join) (it's free) -* Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side +- Create a [Hugging Face account](https://huggingface.co/join) (it's free) +- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side By default, your [role](https://huggingface.co/docs/hub/organizations-security#access-control-in-organizations) in the organization is `contributor`, which gives you write access to any datasets that you create (and only those). If you'd like to make changes to other datasets, [open a discussion or Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions). @@ -76,8 +78,8 @@ python -m pip install datasets We've created a [Gradio application](https://huggingface.co/spaces/OpenAssistant/dataset-generator) on Hugging Face Spaces that will create a new dataset repository for you with the following template files: -* A dataset loading script -* A dataset card +- A dataset loading script +- A dataset card Simply provide the name of the new dataset and your access token from Step 1, and you're good to go! @@ -97,5 +99,4 @@ from datasets import load_dataset load_dataset("OpenAssistant/my_dataset") ``` -Congratulations - you've now added a dataset to the Hub! - +Congratulations - you've now added a dataset to the OpenAssistant org! 
diff --git a/openassistant/__init__.py b/openassistant/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openassistant/templates/README.md b/openassistant/templates/README.md new file mode 100644 index 00000000..b7899eb7 --- /dev/null +++ b/openassistant/templates/README.md @@ -0,0 +1,31 @@ +# Dataset instructions for {dataset_name} + +## Setup + +```bash +python -m pip install -r requirements.txt +``` + +## Download + +```bash +python download.py +``` + +## Preprocess + +```bash +python preprocess.py +``` + +## Load + +```bash +python load.py +``` + +## Upload + +```bash +python upload.py +``` diff --git a/openassistant/templates/requirements.txt b/openassistant/templates/requirements.txt new file mode 100644 index 00000000..74fb5166 --- /dev/null +++ b/openassistant/templates/requirements.txt @@ -0,0 +1,2 @@ +datasets>=2.8 +typer diff --git a/openassistant/templates/template.py b/openassistant/templates/template.py new file mode 100644 index 00000000..b6c52871 --- /dev/null +++ b/openassistant/templates/template.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the OpenAssistant repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. 
+ +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/dataset_script + +This template is adapted from the one provided by BigScience's BigBIO library: +https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py + +TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset. +""" + +import os +from typing import Dict, List, Tuple + +import datasets + +from openassistant.utils.configs import OpenAssistantConfig + +# TODO: import the schema that fits your dataset: +from openassistant.utils.schemas import + +# TODO: Add BibTeX citation where appropriate +_CITATION = """\ +@article{, + author = {}, + title = {}, + journal = {}, + volume = {}, + year = {}, + url = {}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +# TODO: create a module level variable with your dataset name (should match the script name) +# E.g. The Pile: [dataset_name] --> the_pile +_DATASETNAME = "[dataset_name]" +# TODO: create a pretty display name for your dataset +_DISPLAYNAME = "Dataset Name" + +# TODO: Add a description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This dataset is designed for XXX NLP task. +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "" + +# TODO: Add the licence for the dataset here (if possible) +# Note that this doesn't have to be a common open source license. +# Some datasets have custom licenses. 
In this case, simply put the full license terms +# into `_LICENSE` +_LICENSE = "" + +# TODO: Add links to the URLs needed to download your dataset files. +# This variable can be a relative path for datasets whose files need to be +# manually downloaded or preprocessed in advance. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# However, if you need to access different files for each config you can have multiple entries in this dict. +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "url or list of urls or relative path like ./data ", +} + +# TODO: add supported task by dataset. One dataset may support multiple tasks +_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_VERSION = "" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: ThePile --> ThePileDataset +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + VERSION = datasets.Version(_VERSION) + + # You will be able to load each dataset with + # dataset = datasets.load_dataset('my_dataset') + + # TODO: For each dataset, implement a config for each subset; + # If a dataset contains more than one subset, implement a config for EACH of them. + # Each of them should contain: + # - name: should be unique for each dataset config eg. the_pile_[schema_name] + # - version: VERSION + # - description: one line description for the dataset + # - schema: open_assistant_[schema_name] + # - subset_id: subset id is the canonical name for the dataset (eg. 
the_pile) + # where [schema_name] = (language_modeling) + + BUILDER_CONFIGS = [ + OpenAssistantConfig( + name=f"{_DATASETNAME}_[schema_name]", + version=VERSION, + description=f"OpenAssistant dataset config for {_DATASETNAME}", + schema_name="[schema_name]", + subset_id=_DATASETNAME, + ) + ] + + DEFAULT_CONFIG_NAME = _DATASETNAME + + def _info(self) -> datasets.DatasetInfo: + # TODO: Implement the schema for your dataset here. + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access a config choice, that will be in self.config.name + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. 
+ + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.jsonl"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "dev.jsonl"), + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
+ + # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + if self.config.schema == "[schema_name]": + # TODO: yield (key, example) tuples in the given schema + for key, example in thing: + yield key, example + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__) diff --git a/openassistant/utils/__init__.py b/openassistant/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openassistant/utils/configs.py b/openassistant/utils/configs.py new file mode 100644 index 00000000..2391f66b --- /dev/null +++ b/openassistant/utils/configs.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +from dataclasses import dataclass + +import datasets + + +@dataclass +class OpenAssistantConfig(datasets.BuilderConfig): + """BuilderConfig for OpenAssistant datasets.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None diff --git a/openassistant/utils/schemas/__init__.py b/openassistant/utils/schemas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openassistant/utils/schemas/language_modeling.py b/openassistant/utils/schemas/language_modeling.py new file mode 100644 index 00000000..0b2072a5 --- /dev/null +++ b/openassistant/utils/schemas/language_modeling.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +""" +Language Modeling Schema +""" +import datasets + +features = datasets.Features( + { + "text": datasets.Value("string"), + "meta": [datasets.Value("string")], + } +) From da79aa04a0e4d4293b1427395e4a4a5770ea577d Mon Sep 17 00:00:00 2001 From: Bobak Hashemi Date: Tue, 3 Jan 2023 21:45:16 -0500 Subject: [PATCH 033/121] Cleaned up default argument logic. 
--- model/reward/instructor/trainer.py | 9 +++------ model/reward/instructor/utils.py | 9 ++++++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/model/reward/instructor/trainer.py b/model/reward/instructor/trainer.py index 124c28f8..2eee8b8d 100644 --- a/model/reward/instructor/trainer.py +++ b/model/reward/instructor/trainer.py @@ -168,7 +168,7 @@ if __name__ == "__main__": loss_function=training_conf["loss"], learning_rate=training_conf["learning_rate"], # half_precision_backend="apex", - fp16=training_conf["fp16"] if "fp16" in training_conf else True, + fp16=training_conf["fp16"], gradient_checkpointing=training_conf["gradient_checkpointing"], gradient_accumulation_steps=training_conf["gradient_accumulation_steps"], per_device_train_batch_size=training_conf["per_device_train_batch_size"], @@ -180,7 +180,7 @@ if __name__ == "__main__": evaluation_strategy="steps", eval_steps=training_conf["eval_steps"], save_steps=1000, - report_to="wandb", + report_to="local", ) train_datasets, evals = [], {} if "webgpt" in training_conf["datasets"]: @@ -196,10 +196,7 @@ if __name__ == "__main__": evals["hfsummary"] = sum_eval train = ConcatDataset(train_datasets) - if "tokenizer_name" in training_conf: - tokenizer = get_tokenizer(training_conf["tokenizer_name"]) - else: - tokenizer = get_tokenizer(model_name) + tokenizer = get_tokenizer(training_conf["tokenizer_name"]) if "rankgen" in model_name: collate_fn = RankGenCollator(tokenizer, max_length=training_conf["max_length"]) diff --git a/model/reward/instructor/utils.py b/model/reward/instructor/utils.py index 780ac9c8..7946fbb2 100644 --- a/model/reward/instructor/utils.py +++ b/model/reward/instructor/utils.py @@ -71,6 +71,10 @@ def freeze_top_n_layers(model, target_layers): def argument_parsing(parser): + args = parser.parse_args() + with open(args.config, "r", encoding="utf-8") as f: + training_conf = yaml.safe_load(f.read()) + default_params = { "num_train_epochs": 4, "learning_rate": 3e-5, @@ -82,10 +86,9 
@@ def argument_parsing(parser): "gradient_accumulation_steps": 8, "gradient_checkpointing": False, "datasets": ["webgpt"], + "fp16": True, + "tokenizer_name": training_conf["model_name"], } - args = parser.parse_args() - with open(args.config, "r", encoding="utf-8") as f: - training_conf = yaml.safe_load(f.read()) params = {**default_params, **training_conf} params["gradient_accumulation_steps"] = int(params["gradient_accumulation_steps"]) From 000a9083d0033bcbc36e429c2bf6bcefec16794a Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 4 Jan 2023 15:15:41 +1100 Subject: [PATCH 034/121] Fix style --- docs/datasets.md | 349 +++++++++++++++++++++++ docs/datasets/README.md | 102 ------- openassistant/dev-requirements.txt | 1 + openassistant/templates/README.md | 28 +- openassistant/templates/dataset_card.md | 28 ++ openassistant/templates/prepare.py | 8 + openassistant/templates/requirements.txt | 2 - 7 files changed, 390 insertions(+), 128 deletions(-) create mode 100644 docs/datasets.md delete mode 100644 docs/datasets/README.md create mode 100644 openassistant/dev-requirements.txt create mode 100644 openassistant/templates/dataset_card.md create mode 100644 openassistant/templates/prepare.py delete mode 100644 openassistant/templates/requirements.txt diff --git a/docs/datasets.md b/docs/datasets.md new file mode 100644 index 00000000..c5400807 --- /dev/null +++ b/docs/datasets.md @@ -0,0 +1,349 @@ +# Datasets + +The datasets for this project are currently hosted as loading scripts on the [Open-Assistant organization](https://huggingface.co/OpenAssistant) the Hugging Face Hub. Each of them can be loaded by first installing the 🤗 Datasets library: + +```bash +python -m pip install datasets +``` + +and then running: + +```python +from datasets import load_dataset + +dataset = load_dataset("OpenAssistant/{dataset-name}") +``` + +We use this GitHub repository to accept new submissions and standardize quality control. 
See the instructions below if you'd like to contribute a new dataset to the project. + +## Adding a new dataset + +### 0. Pre-Requisites + +Install Git and create a GitHub account prior to implementing a dataset; you can follow instructions to install Git [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). + +You will also need at least Python 3.8+. If you are installing Python, we recommend downloading [Anaconda](https://docs.anaconda.com/anaconda/install/index.html) to curate a python environment with necessary packages. **We strongly recommend Python 3.8+ for stability**. + +### 1. **Fork the OpenAssistant repository** + +Fork the `OpenAssistant`[repository](https://github.com/LAION-AI/Open-Assistant). To do this, click the link to the repository and click "Fork" in the upper-right corner. You should get an option to fork to your account, provided you are signed into Github. + +After you fork, clone the repository locally. You can do so as follows: + +```bash +git clone git@github.com:/OpenAssistant.git +cd OpenAssistant # enter the directory +``` + +Next, you want to set your `upstream` location to enable you to push/pull (add or receive updates). You can do so as follows: + +```bash +git remote add upstream git@github.com:LAION-AI/Open-Assistant.git +``` + +You can optionally check that this was set properly by running the following command: + +```bash +git remote -v +``` + +The output of this command should look as follows: + +```bash +origin git@github.com:/Open-Assistant.git (fetch) +origin git@github.com:/Open-Assistant.git (push) +upstream git@github.com:LAION-AI/Open-Assistant.git (fetch) +upstream git@github.com:LAION-AI/Open-Assistant.git (push) +``` + +If you do NOT have an `origin` for whatever reason, then run: + +```bash +git remote add origin git@github.com:/OpenAssistant.git +``` + +The goal of `upstream` is to keep your repository up-to-date to any changes that are made officially to the OpenAssistant repo. 
You can do this by running the following commands: + +``` +git fetch upstream +git pull +``` + +Provided you have no _merge conflicts_, this will ensure the repo stays up-to-date as you make changes. However, before you make changes, you should make a custom branch to implement your changes. + +You can make a new branch as such: + +``` +git checkout -b {branch_name} +``` + +

Please do not make changes on the main branch!

+ +Always make sure you're on the right branch with the following command: + +``` +git branch +``` + +The correct branch will have an asterisk \* in front of it. + +### 2. **Create a development environment** + +You can make an environment in any way you choose to. We highlight two possible options: + +#### 2a) Create a conda environment + +The following instructions will create an Anaconda `openassistant` environment. + +- Install [anaconda](https://docs.anaconda.com/anaconda/install/) for your appropriate operating system. +- Run the following commands from the top level of the repository (you can pick your python version): + +```bash +conda create -n openassistant python=3.8 # Creates a conda env +conda activate openassistant # Activate your conda environment +cd openassistant +pip install -r dev-requirements.txt # Install this while in the openassistant folder +``` + +You can deactivate your environment at any time by either exiting your terminal or using `conda deactivate`. + +#### 2b) Create a venv environment + +Python 3.3+ has venv automatically installed; official information is found [here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/). + +``` +python3 -m venv {env_name} +source {env_name}/bin/activate # activate environment +cd openassistant +pip install -r dev-requirements.txt # Install this while in the openassistant folder +``` + +Make sure your `pip` package points to your environment's source. + +### 3. Prepare a folder in `datasets` for your dataloader + +Make a new directory within the `openassistant/datasets` directory: + +```bash +mkdir openassistant/datasets/{dataset_name} +``` + +**NOTE**: Please use snake_case, i.e. lowercase letters and underscores when choosing a `{dataset_name}`. + +Add an `__init__.py` file to this directory: + +```bash +touch openassistant/datasets/{dataset_name}/__init__.py +``` + +Next, copy the `template.py` script of `templates` into your dataset folder. 
This script has "TODOs" to fill in for your dataloading script: + +```bash +cp templates/template.py openassistant/datasets/{dataset_name}/{dataset_name}.py +``` + +#### (Optional) Prepare local dataset files + +If your dataset files aren't publicly available via URLs (e.g. because you implemented a web scraper), you'll need to implement some extra logic to store and prepare the data locally prior to implementing a loading script in 🤗 Datasets. + +To do so, first copy the template script for dataset creation: + +```bash +cp templates/prepare.py openassistant/datasets/{dataset_name}/ +``` + +Next, implement any logic that is needed to prepare a local version of the dataset files (by convention we store them in `datasets/{dataset_name}/data/`). Add any extra dependencies to a `requirements.txt` file and provide instructions on how to prepare the dataset files in a README: + +```bash +touch openassistant/datasets/{dataset_name}/requirements.txt +cp templates/README.md openassistant/datasets/{dataset_name}/ +``` + +**Note:** Do not commit any dataset files to the OpenAssistant repo - all data will be hosted on the Hugging Face Hub. This step is needed for the data admins to be able to replicate the dataset creation process before pushing to the Hub. + +### 4. Implement your dataset + +To implement your dataloader, you will need to follow `template.py` and fill in all necessary TODOs. There are three key methods that are important: + +- `_info`: Specifies the schema of the expected dataloader +- `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split. +- `_generate_examples`: Creates examples from data that conform to each schema defined in `_info`. + +For the `_info` function, you will need to define `features` for your +`DatasetInfo` object. For each dataset config, choose the right schema from our list of examples. You can find the schemas in the [schemas directory](openassistant/utils/schemas/). + +You will use this schema in the `_generate_examples` return value. 
+ +Populate the information in the dataset according to this schema; some fields may be empty. + +#### Example scripts + +TODO + +#### Running & Debugging + +You can run your data loader script during development by appending the following +statement to your code ([templates/template.py](templates/template.py) already includes this): + +```python +if __name__ == "__main__": + datasets.load_dataset(__file__) +``` + +If you want to use an interactive debugger during development, you will have to use +`breakpoint()` instead of setting breakpoints directly in your IDE. Most IDEs will +recognize the `breakpoint()` statement and pause there during debugging. If your preferred +IDE doesn't support this, you can always run the script in your terminal and debug with +`pdb`. + +### 5. Check if your dataloader works + +Make sure your dataset is implemented correctly by checking in python the following commands: + +```python +from datasets import load_dataset + +data = load_dataset("openassistant/datasets//.py", name="_") +``` + +Run these commands from the top level of the `OpenAssistant` repo. + +### 6. Create a dataset card + +Copy and fill out the template dataset card: + +```bash +cp templates/dataset_card.md openassistant/datasets//README.md +``` + +### 7. Format your code + +From the main directory, run the code quality checks via the following command: + +``` +pre-commit run --all-files +``` + +This runs the black formatter, isort, and lints to ensure that the code is readable and looks nice. Flake8 linting errors may require manual changes. + +### 8. Commit your changes + +First, commit your changes to the branch to "add" the work: + +``` +git add openassistant/datasets//*.py +git commit -m "A message describing your commits" +``` + +Then, run the following commands to incorporate any new changes in the master branch of datasets as follows: + +``` +git fetch upstream +git rebase upstream/main +``` + +**Run these commands in your custom branch**. 
+ +Push these changes to **your fork** with the following command: + +``` +git push -u origin {branch_name} +``` + +### 9. **Make a pull request** + +Make a Pull Request to implement your changes on the main repository [here](https://github.com/LAION-AI/Open-Assistant/pulls). To do so, click "New Pull Request". Then, choose your branch from your fork to push into "base:main". + +When opening a PR, please link the [issue](https://github.com/LAION-AI/Open-Assistant/issues) corresponding to your dataset using [closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) in the PR's description, e.g. `resolves #17`. + +## [Admins] Uploading a dataset to the Hugging Face Hub + +Uploading a new dataset from `openassistant/datasets/` to the Hugging Face Hub typically involves the following steps: + +1. Setup +2. Create a new dataset repository +3. Copy a dataset loading script and dataset card +4. Upload to the Hub + +### 1. Setup + +To upload a dataset to the OpenAssistant organization, you first need to: + +- Create a [Hugging Face account](https://huggingface.co/join) (it's free) +- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side + +Next, check that you're correctly logged in and that `git-lfs` is installed so that the dataset can be uploaded. To log in, create a **write access token** that can be found under your Hugging Face profile (icon in the top right corner on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens -> New Token). Alternatively, you can go to [your token settings](https://huggingface.co/settings/tokens) directly. + +Once you've created a token, run: + +```bash +huggingface-cli login +``` + +in a terminal or, in case you're working in a notebook, run: + +```python +from huggingface_hub import notebook_login + +notebook_login() +``` + +You can then copy-paste your token to log in locally. 
+ +Next, let's make sure that `git-lfs` is correctly installed. To do so, simply run: + +```bash +git-lfs -v +``` + +The output should show something like `git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that the `git-lfs` command was not found, please make sure to install it [here](https://git-lfs.github.com/) or simply via: + +```bash +sudo apt-get install git-lfs +git config --global user.email "you@example.com" +git config --global user.name "Your Name" +``` + +The final step of the setup is to install the 🤗 Datasets library by running: + +```bash +python -m pip install datasets +``` + +### 2. Create a new dataset repository + +Follow [this guide](https://huggingface.co/docs/datasets/upload_dataset) for instructions on creating a new dataset repo on the Hub. Use the same snake_case name as the dataset in `openassistant/datasets/`. + +Once you've created the dataset repo, clone it by running: + +```bash +git clone https://huggingface.co/datasets/OpenAssistant/ +cd +``` + +### 3. Copy a dataset loading script and dataset card + +Next, copy the loading script and dataset card to your repo: + +```bash +cp openassistant/datasets//.py . +cp openassistant/datasets//README.md . +``` + +#### (Optional) Prepare local dataset files + +If the dataset files of `openassistant/datasets/` aren't public, you'll need to run the `openassistant/datasets//prepare.py` script to create them. Store them in the same directory that is specified by the loading script (`data` by default). + +### 4. Upload to the Hub + +Once the dataset script and card are ready, use Git to push them to the Hub (along with any data files you may need). + +At this point, you can load the dataset by running: + +```python +from datasets import load_dataset + +load_dataset("OpenAssistant/{dataset_name}") +``` + +Congratulations - you've now added a dataset to the OpenAssistant org! 
diff --git a/docs/datasets/README.md b/docs/datasets/README.md deleted file mode 100644 index 08c5fd13..00000000 --- a/docs/datasets/README.md +++ /dev/null @@ -1,102 +0,0 @@ -# Datasets - -The datasets for this project are currently hosted under the [Open-Assistant organization](https://huggingface.co/OpenAssistant) the Hugging Face Hub. Each of them can be loaded by first installing the 🤗 Datasets library: - -```bash -python -m pip install datasets -``` - -and then running: - -```python -from datasets import load_dataset - -dataset = load_dataset("OpenAssistant/{dataset-name}") -``` - -See the instructions below if you'd like to contribute a new dataset to the project. - -## Adding a new dataset - -## Uploading a dataset to the Hugging Face Hub - -Adding a new dataset for the OpenAssistant project typically involves the following steps: - -1. Setup -2. Create a new dataset repository -3. Create a dataset loading script and dataset card -4. Upload to the Hub - -### 1. Setup - -To upload a dataset to the OpenAssistant organization, you first need to: - -- Create a [Hugging Face account](https://huggingface.co/join) (it's free) -- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side - -By default, your [role](https://huggingface.co/docs/hub/organizations-security#access-control-in-organizations) in the organization is `contributor`, which gives you write access to any datasets that you create (and only those). If you'd like to make changes to other datasets, [open a discussion or Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions). - -Next, check that you're correctly logged in and that `git-lfs` is installed so that the dataset can be uploaded. 
To log in, create a **write access token** that can be found under your Hugging Face profile (icon in the top right corner on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens -> New Token. Alternatively, you can go to [your token settings](https://huggingface.co/settings/tokens) directly. - -Once you've created a token, run: - -```bash -huggingface-cli login -``` - -in a terminal, or case you're working in a notebook - -```python -from huggingface_hub import notebook_login - -notebook_login() -``` - -You can then copy-paste your token to log in locally. - -Next, let's make sure that `git-lfs` is correctly installed. To do so, simply run: - -```bash -git-lfs -v -``` - -The output should show something like `git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that the `git-lfs` command was not found, please make sure to install it [here](https://git-lfs.github.com/) or simply via: - -```bash -sudo apt-get install git-lfs -git config --global user.email "you@example.com" -git config --global user.name "Your Name" -``` - -The final step of the setup is to install the 🤗 Datasets library by running: - -```bash -python -m pip install datasets -``` - -### 2. Create a new dataset repository - -We've created a [Gradio application](https://huggingface.co/spaces/OpenAssistant/dataset-generator) on Hugging Face Spaces that will create a new dataset repository for you with the following template files: - -- A dataset loading script -- A dataset card - -Simply provide the name of the new dataset and your access token from Step 1, and you're good to go! - -### 3. Create a dataset loading script - -If you've followed Step 2, a template dataset loading script will have been created in your new dataset repository. Edit the script according to [this guide](https://huggingface.co/docs/datasets/dataset_script). Then fill out the missing details in the dataset card (the `README.md` file) - -### 4. 
Upload to the Hub - -Once the dataset script and card are ready, use Git to push them to the Hub (along with any data files you may need). - -At this point, you can load the dataset by running: - -```python -from datasets import load_dataset - -load_dataset("OpenAssistant/my_dataset") -``` - -Congratulations - you've now added a dataset to the OpenAssistant org! diff --git a/openassistant/dev-requirements.txt b/openassistant/dev-requirements.txt new file mode 100644 index 00000000..fe709d0e --- /dev/null +++ b/openassistant/dev-requirements.txt @@ -0,0 +1 @@ +datasets>=2.8,<3.0.0 diff --git a/openassistant/templates/README.md b/openassistant/templates/README.md index b7899eb7..3d4eb28f 100644 --- a/openassistant/templates/README.md +++ b/openassistant/templates/README.md @@ -1,31 +1,11 @@ -# Dataset instructions for {dataset_name} +# Dataset preparation instructions for {dataset_name} ## Setup -```bash -python -m pip install -r requirements.txt -``` +Add any installation details here. -## Download +## Usage -```bash -python download.py -``` +Explain how to run any scripts that involve special downloading (e.g. data is obtained from a web scraper) or data preprocessing. -## Preprocess -```bash -python preprocess.py -``` - -## Load - -```bash -python load.py -``` - -## Upload - -```bash -python upload.py -``` diff --git a/openassistant/templates/dataset_card.md b/openassistant/templates/dataset_card.md new file mode 100644 index 00000000..76736d8f --- /dev/null +++ b/openassistant/templates/dataset_card.md @@ -0,0 +1,28 @@ +--- +license: mit +tags: +- open-assistant +- human-feedback +- dialogue-modeling +- language-modeling +--- + +# Dataset card for {dataset_name} + +This is a dataset card template for the [LAION-AI OpenAssistant project](https://github.com/LAION-AI/Open-Assistant). Fill out this template when adding a new dataset to the Hugging Face Hub. 
+ +## Dataset summary + +[More information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) + +## Usage + +[More information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) + +## Source data + +[More information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) + +## Citation + +[More information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) \ No newline at end of file diff --git a/openassistant/templates/prepare.py b/openassistant/templates/prepare.py new file mode 100644 index 00000000..83a6b15b --- /dev/null +++ b/openassistant/templates/prepare.py @@ -0,0 +1,8 @@ +import typer + +def main(output_dir: str = "data"): + """Download and prepare the dataset for use.""" + raise NotImplementedError + +if __name__ == "__main__": + typer.run(main) \ No newline at end of file diff --git a/openassistant/templates/requirements.txt b/openassistant/templates/requirements.txt deleted file mode 100644 index 74fb5166..00000000 --- a/openassistant/templates/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -datasets>=2.8 -typer From 5b0f6c156737cd291db55ae9144366cbaf604a4e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 4 Jan 2023 15:35:19 +1100 Subject: [PATCH 035/121] pre-commit fix --- .pre-commit-config.yaml | 2 +- docs/datasets.md | 162 +++++++++++++----- openassistant/templates/README.md | 5 +- .../{utils/configs.py => templates/hub.py} | 9 +- openassistant/templates/template.py | 7 +- openassistant/utils/__init__.py | 0 openassistant/utils/schemas/__init__.py | 0 .../utils/schemas/language_modeling.py | 12 -- 8 files changed, 131 insertions(+), 66 deletions(-) rename openassistant/{utils/configs.py => templates/hub.py} (69%) delete mode 100644 openassistant/utils/__init__.py delete mode 100644 
openassistant/utils/schemas/__init__.py delete mode 100644 openassistant/utils/schemas/language_modeling.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7345327..7798b093 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ # # /WARNING! -exclude: "build|stubs|^bot/templates/|openassistant/templates/$" +exclude: "build|stubs|^bot/$|templates/|openassistant/templates" repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/docs/datasets.md b/docs/datasets.md index c5400807..f905d74a 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -1,6 +1,9 @@ # Datasets -The datasets for this project are currently hosted as loading scripts on the [Open-Assistant organization](https://huggingface.co/OpenAssistant) the Hugging Face Hub. Each of them can be loaded by first installing the 🤗 Datasets library: +The datasets for this project are currently hosted as loading scripts on the +[Open-Assistant organization](https://huggingface.co/OpenAssistant) the Hugging +Face Hub. Each of them can be loaded by first installing the 🤗 Datasets +library: ```bash python -m pip install datasets @@ -14,19 +17,31 @@ from datasets import load_dataset dataset = load_dataset("OpenAssistant/{dataset-name}") ``` -We use this GitHub repository to accept new submissions and standardize quality control. See the instructions below if you'd like to contribute a new dataset to the project. +We use this GitHub repository to accept new submissions and standardize quality +control. See the instructions below if you'd like to contribute a new dataset to +the project. ## Adding a new dataset ### 0. Pre-Requisites -Install Git and create a GitHub account prior to implementing a dataset; you can follow instructions to install Git [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). 
+Install Git and create a GitHub account prior to implementing a dataset; you can +follow instructions to install Git +[here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). -You will also need at least Python 3.8+. If you are installing Python, we recommend downloading [Anaconda](https://docs.anaconda.com/anaconda/install/index.html) to curate a python environment with necessary packages. **We strongly recommend Python 3.8+ for stability**. +You will also need at least Python 3.8+. If you are installing Python, we +recommend downloading +[Anaconda](https://docs.anaconda.com/anaconda/install/index.html) to curate a +python environment with necessary packages. **We strongly recommend Python 3.8+ +for stability**. ### 1. **Fork the OpenAssistant repository** -Fork the `OpenAssistant`[repository](https://github.com/LAION-AI/Open-Assistant). To do this, click the link to the repository and click "Fork" in the upper-right corner. You should get an option to fork to your account, provided you are signed into Github. +Fork the +`OpenAssistant`[repository](https://github.com/LAION-AI/Open-Assistant). To do +this, click the link to the repository and click "Fork" in the upper-right +corner. You should get an option to fork to your account, provided you are +signed into Github. After you fork, clone the repository locally. You can do so as follows: @@ -35,13 +50,15 @@ git clone git@github.com:/OpenAssistant.git cd OpenAssistant # enter the directory ``` -Next, you want to set your `upstream` location to enable you to push/pull (add or receive updates). You can do so as follows: +Next, you want to set your `upstream` location to enable you to push/pull (add +or receive updates). 
You can do so as follows: ```bash git remote add upstream git@github.com:LAION-AI/Open-Assistant.git ``` -You can optionally check that this was set properly by running the following command: +You can optionally check that this was set properly by running the following +command: ```bash git remote -v @@ -62,14 +79,18 @@ If you do NOT have an `origin` for whatever reason, then run: git remote add origin git@github.com:/OpenAssistant.git ``` -The goal of `upstream` is to keep your repository up-to-date to any changes that are made officially to the OpenAssistant repo. You can do this as follows by running the following commands: +The goal of `upstream` is to keep your repository up-to-date to any changes that +are made officially to the OpenAssistant repo. You can do this as follows by +running the following commands: ``` git fetch upstream git pull ``` -Provided you have no _merge conflicts_, this will ensure the repo stays up-to-date as you make changes. However, before you make changes, you should make a custom branch to implement your changes. +Provided you have no _merge conflicts_, this will ensure the repo stays +up-to-date as you make changes. However, before you make changes, you should +make a custom branch to implement your changes. You can make a new branch as such: @@ -89,14 +110,17 @@ The correct branch will have a asterisk \* in front of it. ### 2. **Create a development environment** -You can make an environment in any way you choose to. We highlight two possible options: +You can make an environment in any way you choose to. We highlight two possible +options: #### 2a) Create a conda environment The following instructions will create an Anaconda `openassistant` environment. -- Install [anaconda](https://docs.anaconda.com/anaconda/install/) for your appropriate operating system. 
-- Run the following command while in the `biomedical` folder (you can pick your python version): +- Install [anaconda](https://docs.anaconda.com/anaconda/install/) for your + appropriate operating system. +- Run the following command while in the `biomedical` folder (you can pick your + python version): ```bash conda create -n openassistant python=3.8 # Creates a conda env @@ -105,11 +129,13 @@ cd openassistant pip install -r dev-requirements.txt # Install this while in the openassistant folder ``` -You can deactivate your environment at any time by either exiting your terminal or using `conda deactivate`. +You can deactivate your environment at any time by either exiting your terminal +or using `conda deactivate`. #### 2b) Create a venv environment -Python 3.3+ has venv automatically installed; official information is found [here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/). +Python 3.3+ has venv automatically installed; official information is found +[here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/). ``` python3 -m venv @@ -128,7 +154,8 @@ Make a new directory within the `openassistant/datasets` directory: mkdir openassistant/datasets/ ``` -**NOTE**: Please use snake_case, i.e. lowercase letters and underscores when choosing a ``. +**NOTE**: Please use snake_case, i.e. lowercase letters and underscores when +choosing a ``. Add an `__init__.py` file to this directory: @@ -136,15 +163,21 @@ Add an `__init__.py` file to this directory: touch openassistant/datasets//__init__.py ``` -Next, copy the `template.py` script of `templates` into your dataset folder. This script has "TODOs" to fill in for your dataloading script: +Next, copy the `template.py` script and `hub.py` module of `templates` into your +dataset folder. 
The `template.py` script has "TODOs" to fill in for your +dataloading script: ```bash +cp templates/hub.py openassistant/datasets// cp templates/template.py openassistant/datasets//.py ``` #### (Optional) Prepare local dataset files -If your dataset files aren't publicly available via URLs (e.g. because you implemented a web scraper), you'll need to implement some extra logic to store and prepare the data locally prior to implementing a loading script in 🤗 Datasets. +If your dataset files aren't publicly available via URLs (e.g. because you +implemented a web scraper), you'll need to implement some extra logic to store +and prepare the data locally prior to implementing a loading script in 🤗 +Datasets. To do so, first copy the template script for dataset creation: @@ -152,53 +185,67 @@ To do so, first copy the template script for dataset creation: cp templates/prepare.py openassistant/datasets// ``` -Next, implement any logic that is needed to prepare a local version of the dataset files (by convention we store them in `datasets//data/`). Add any extra dependencies to a `requirements.txt` file and provide instructions on how to prepare the dataset files in a README: +Next, implement any logic that is needed to prepare a local version of the +dataset files (by convention we store them in `datasets//data/`). +Add any extra dependencies to a `requirements.txt` file and provide instructions +on how to prepare the dataset files in a README: ```bash touch openassistant/datasets//requirements.txt cp templates/README.py openassistant/datasets// ``` -**Note:** Do not commit any dataset files to the OpenAssistant repo - all data will be hosted on the Hugging Face Hub. This step is needed for the data admins to be able to replicate the dataset creation process before pushing to the Hub. +**Note:** Do not commit any dataset files to the OpenAssistant repo - all data +will be hosted on the Hugging Face Hub. 
This step is needed for the project's +data admins to be able to replicate the dataset creation process before pushing +to the Hub. ### 4. Implement your dataset -To implement your dataloader, you will need to follow `template.py` and fill in all necessary TODOs. There are three key methods that are important: +To implement your dataloader, you will need to follow `template.py` and fill in +all necessary TODOs. There are three key methods that are important: - `_info`: Specifies the schema of the expected dataloader -- `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. -- `_generate_examples`: Create examples from data that conform to each schema defined in `_info`. +- `_split_generators`: Downloads and extracts data for each split (e.g. + train/val/test) or associate local data with each split. +- `_generate_examples`: Create examples from data that conform to each schema + defined in `_info`. For the `_info_` function, you will need to define `features` for your -`DatasetInfo` object. For each dataset config, choose the right schema from our list of examples. You can find the schemas in the [schemas directory](openassistant/utils/schemas/). +`DatasetInfo` object. For each dataset config, choose the right schema from our +list of examples. You can find the schemas in the +[schemas directory](openassistant/utils/schemas/). You will use this schema in the `_generate_examples` return value. -Populate the information in the dataset according to this schema; some fields may be empty. +Populate the information in the dataset according to this schema; some fields +may be empty. 
#### Example scripts TODO -#### Running & Debugging +#### Running & debugging -You can run your data loader script during development by appending the following -statement to your code ([templates/template.py](templates/template.py) already includes this): +You can run your data loader script during development by appending the +following statement to your code ([templates/template.py](templates/template.py) +already includes this): ```python if __name__ == "__main__": datasets.load_dataset(__file__) ``` -If you want to use an interactive debugger during development, you will have to use -`breakpoint()` instead of setting breakpoints directly in your IDE. Most IDEs will -recognize the `breakpoint()` statement and pause there during debugging. If your preferred -IDE doesn't support this, you can always run the script in your terminal and debug with -`pdb`. +If you want to use an interactive debugger during development, you will have to +use `breakpoint()` instead of setting breakpoints directly in your IDE. Most +IDEs will recognize the `breakpoint()` statement and pause there during +debugging. If your preferred IDE doesn't support this, you can always run the +script in your terminal and debug with `pdb`. ### 5. Check if your dataloader works -Make sure your dataset is implemented correctly by checking in python the following commands: +Make sure your dataset is implemented correctly by checking in python the +following commands: ```python from datasets import load_dataset @@ -224,7 +271,8 @@ From the main directory, run the code quality checks via the following command: pre-commit run --all-files ``` -This runs the black formatter, isort, and lints to ensure that the code is readable and looks nice. Flake8 linting errors may require manual changes. +This runs the black formatter, isort, and lints to ensure that the code is +readable and looks nice. Flake8 linting errors may require manual changes. ### 8. 
Commit your changes @@ -235,7 +283,8 @@ git add openassistant/datasets//*.py git commit -m "A message describing your commits" ``` -Then, run the following commands to incorporate any new changes in the master branch of datasets as follows: +Then, run the following commands to incorporate any new changes in the master +branch of datasets as follows: ``` git fetch upstream @@ -252,13 +301,20 @@ git push -u origin ### 9. **Make a pull request** -Make a Pull Request to implement your changes on the main repository [here](https://github.com/LAION-AI/Open-Assistant/pulls). To do so, click "New Pull Request". Then, choose your branch from your fork to push into "base:main". +Make a Pull Request to implement your changes on the main repository +[here](https://github.com/LAION-AI/Open-Assistant/pulls). To do so, click "New +Pull Request". Then, choose your branch from your fork to push into "base:main". -When opening a PR, please link the [issue](https://github.com/LAION-AI/Open-Assistant/issues) corresponding to your dataset using [closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) in the PR's description, e.g. `resolves #17`. +When opening a PR, please link the +[issue](https://github.com/LAION-AI/Open-Assistant/issues) corresponding to your +dataset using +[closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) +in the PR's description, e.g. `resolves #17`. ## [Admins] Uploading a dataset to the Hugging Face Hub -Uploading a new dataset from `openassistant/datasets/` to the Hugging Face Hub typically involves the following steps: +Uploading a new dataset from `openassistant/datasets/` to the +Hugging Face Hub typically involves the following steps: 1. Setup 2. 
Create a new dataset repository @@ -270,9 +326,15 @@ Uploading a new dataset from `openassistant/datasets/` to the Hugg To upload a dataset to the OpenAssistant organization, you first need to: - Create a [Hugging Face account](https://huggingface.co/join) (it's free) -- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side +- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by + clicking on the _Request to join this org_ button on the top right-hand side -Next, check that you're correctly logged in and that `git-lfs` is installed so that the dataset can be uploaded. To log in, create a **write access token** that can be found under your Hugging Face profile (icon in the top right corner on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens -> New Token. Alternatively, you can go to [your token settings](https://huggingface.co/settings/tokens) directly. +Next, check that you're correctly logged in and that `git-lfs` is installed so +that the dataset can be uploaded. To log in, create a **write access token** +that can be found under your Hugging Face profile (icon in the top right corner +on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens +-> New Token. Alternatively, you can go to +[your token settings](https://huggingface.co/settings/tokens) directly. Once you've created a token, run: @@ -290,13 +352,17 @@ notebook_login() You can then copy-paste your token to log in locally. -Next, let's make sure that `git-lfs` is correctly installed. To do so, simply run: +Next, let's make sure that `git-lfs` is correctly installed. To do so, simply +run: ```bash git-lfs -v ``` -The output should show something like `git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. 
If your console states that the `git-lfs` command was not found, please make sure to install it [here](https://git-lfs.github.com/) or simply via: +The output should show something like +`git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that +the `git-lfs` command was not found, please make sure to install it +[here](https://git-lfs.github.com/) or simply via: ```bash sudo apt-get install git-lfs @@ -312,7 +378,9 @@ python -m pip install datasets ### 2. Create a new dataset repository -Follow [this guide](https://huggingface.co/docs/datasets/upload_dataset) for instructions on creating a new dataset repo on the Hub. Use the same snake_case name as the dataset in `openassistant/datasets/`. +Follow [this guide](https://huggingface.co/docs/datasets/upload_dataset) for +instructions on creating a new dataset repo on the Hub. Use the same snake_case +name as the dataset in `openassistant/datasets/`. Once you've created the dataset repo, clone it by running: @@ -332,11 +400,15 @@ cp openassistant/datasets//README.md . #### (Optional) Prepare local dataset files -If the dataset files of `openassistant/datasets/` aren't public, you'll need to run the `openassistant/datasets//prepare.py` script to create them. Store them in the same directory that is specified by the loading script (`data` by default). +If the dataset files of `openassistant/datasets/` aren't public, +you'll need to run the `openassistant/datasets//prepare.py` script +to create them. Store them in the same directory that is specified by the +loading script (`data` by default). ### 4. Upload to the Hub -Once the dataset script and card are ready, use Git to push them to the Hub (along with any data files you may need). +Once the dataset script and card are ready, use Git to push them to the Hub +(along with any data files you may need). 
At this point, you can load the dataset by running: diff --git a/openassistant/templates/README.md b/openassistant/templates/README.md index 3d4eb28f..b174c47e 100644 --- a/openassistant/templates/README.md +++ b/openassistant/templates/README.md @@ -6,6 +6,5 @@ Add any installation details here. ## Usage -Explain how to run any scripts that involve special downloading (e.g. data is obtained from a web scraper) or data preprocessing. - - +Explain how to run any scripts that involve preparing local dataset files, e.g. +if the dataset files aren't public or are produced by a web scraper. diff --git a/openassistant/utils/configs.py b/openassistant/templates/hub.py similarity index 69% rename from openassistant/utils/configs.py rename to openassistant/templates/hub.py index 2391f66b..49194e31 100644 --- a/openassistant/utils/configs.py +++ b/openassistant/templates/hub.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from dataclasses import dataclass import datasets @@ -13,3 +12,11 @@ class OpenAssistantConfig(datasets.BuilderConfig): description: str = None schema: str = None subset_id: str = None + + +lm_features = datasets.Features( + { + "text": datasets.Value("string"), + "meta": [datasets.Value("string")], + } +) diff --git a/openassistant/templates/template.py b/openassistant/templates/template.py index b6c52871..391df55f 100644 --- a/openassistant/templates/template.py +++ b/openassistant/templates/template.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # Copyright 2023 The OpenAssistant Authors and the current dataset script contributor. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,10 +36,10 @@ from typing import Dict, List, Tuple import datasets -from openassistant.utils.configs import OpenAssistantConfig +from .hub import OpenAssistantConfig -# TODO: import the schema that fits your dataset: -from openassistant.utils.schemas import +# TODO: import the schema (i.e. 
features) that fits your dataset: +from .hub import # TODO: Add BibTeX citation where appropriate _CITATION = """\ diff --git a/openassistant/utils/__init__.py b/openassistant/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/openassistant/utils/schemas/__init__.py b/openassistant/utils/schemas/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/openassistant/utils/schemas/language_modeling.py b/openassistant/utils/schemas/language_modeling.py deleted file mode 100644 index 0b2072a5..00000000 --- a/openassistant/utils/schemas/language_modeling.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Language Modeling Schema -""" -import datasets - -features = datasets.Features( - { - "text": datasets.Value("string"), - "meta": [datasets.Value("string")], - } -) From 8eab50b1c15816c46bebb2a98c33214ba65c8e9d Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 4 Jan 2023 15:36:10 +1100 Subject: [PATCH 036/121] pre-commit fix --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7798b093..beb74e79 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ # # /WARNING! -exclude: "build|stubs|^bot/$|templates/|openassistant/templates" +exclude: "build|stubs|^bot/templates/$|templates/|openassistant/templates" repos: - repo: https://github.com/pre-commit/pre-commit-hooks From 42e8440f2c680f5defa5f0c513eaccf47149d252 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 4 Jan 2023 15:36:32 +1100 Subject: [PATCH 037/121] pre-commit fix --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index beb74e79..ea445666 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ # # /WARNING! 
-exclude: "build|stubs|^bot/templates/$|templates/|openassistant/templates" +exclude: build|stubs|^bot/templates/$|templates/|openassistant/templates repos: - repo: https://github.com/pre-commit/pre-commit-hooks From 6950fff78128d94ee27ac4329c9b678820487c31 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 4 Jan 2023 15:41:52 +1100 Subject: [PATCH 038/121] pre-commit fix --- .pre-commit-config.yaml | 2 +- docs/datasets.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea445666..0bb81e89 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ # # /WARNING! -exclude: build|stubs|^bot/templates/$|templates/|openassistant/templates +exclude: build|stubs|^bot/templates/$|openassistant/templates repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/docs/datasets.md b/docs/datasets.md index f905d74a..ab039871 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -1,4 +1,4 @@ -# Datasets +# OpenAssistant Datasets The datasets for this project are currently hosted as loading scripts on the [Open-Assistant organization](https://huggingface.co/OpenAssistant) the Hugging From ba2c4cbc0fca690ab7a4828e33bdfd02431f4d19 Mon Sep 17 00:00:00 2001 From: rsandb Date: Tue, 3 Jan 2023 22:43:21 -0600 Subject: [PATCH 039/121] Dashboard --- node_modules/.package-lock.json | 6 + website/src/components/Container.tsx | 2 +- website/src/components/Header/Header.tsx | 90 +------------ website/src/components/Header/NavLinks.tsx | 9 +- website/src/components/Header/UserMenu.tsx | 85 ++++++++----- website/src/components/Sortable/Sortable.tsx | 2 +- .../components/Widgets/LeaderboardWidget.tsx | 94 ++++++++++++++ website/src/components/Widgets/SideMenu.tsx | 83 ++++++++++++ website/src/components/Widgets/TaskOption.tsx | 120 ++++++++++++++++++ website/src/components/Widgets/index.ts | 3 + website/src/middleware.ts | 2 +- website/src/pages/account/edit.tsx | 1 - 
website/src/pages/auth/signin.tsx | 7 +- website/src/pages/dashboard.tsx | 37 ++++++ .../src/pages/evaluate/rank_user_replies.tsx | 4 +- website/src/pages/index.tsx | 20 +-- website/styles/Theme/Components/Container.tsx | 25 ++++ website/styles/Theme/colors.tsx | 14 ++ website/styles/Theme/index.tsx | 68 ++++++++++ 19 files changed, 526 insertions(+), 146 deletions(-) create mode 100644 node_modules/.package-lock.json create mode 100644 website/src/components/Widgets/LeaderboardWidget.tsx create mode 100644 website/src/components/Widgets/SideMenu.tsx create mode 100644 website/src/components/Widgets/TaskOption.tsx create mode 100644 website/src/components/Widgets/index.ts create mode 100644 website/src/pages/dashboard.tsx create mode 100644 website/styles/Theme/Components/Container.tsx create mode 100644 website/styles/Theme/colors.tsx create mode 100644 website/styles/Theme/index.tsx diff --git a/node_modules/.package-lock.json b/node_modules/.package-lock.json new file mode 100644 index 00000000..cfd645f0 --- /dev/null +++ b/node_modules/.package-lock.json @@ -0,0 +1,6 @@ +{ + "name": "Open-Assistant", + "lockfileVersion": 2, + "requires": true, + "packages": {} +} diff --git a/website/src/components/Container.tsx b/website/src/components/Container.tsx index f65ed282..4149c0da 100644 --- a/website/src/components/Container.tsx +++ b/website/src/components/Container.tsx @@ -1,5 +1,5 @@ import clsx from "clsx"; export function Container({ className, ...props }) { - return
; + return
; } diff --git a/website/src/components/Header/Header.tsx b/website/src/components/Header/Header.tsx index ddc369ea..ed41936f 100644 --- a/website/src/components/Header/Header.tsx +++ b/website/src/components/Header/Header.tsx @@ -1,45 +1,11 @@ -import { Box, Button, useColorMode } from "@chakra-ui/react"; -import { Popover } from "@headlessui/react"; -import { AnimatePresence, motion } from "framer-motion"; +import { Box, Button, Text, useColorMode } from "@chakra-ui/react"; import Image from "next/image"; import Link from "next/link"; import { useSession } from "next-auth/react"; import { FaUser } from "react-icons/fa"; -import { ColorModeIconToggle } from "../UI/ColorModeIconToggle"; import { UserMenu } from "./UserMenu"; -function MenuIcon(props) { - const { colorMode } = useColorMode(); - const stroke = colorMode === "light" ? "black" : "white"; - return ( - - ); -} - -function ChevronUpIcon(props) { - return ( - - ); -} - -function MobileNavLink({ children, ...props }) { - return ( - - {children} - - ); -} - function AccountButton() { const { data: session } = useSession(); if (session) { @@ -61,64 +27,20 @@ export function Header(props) { : colorMode === "light" ? 
"border-b border-gray-400" : "border-b border-zinc-800"; - return ( diff --git a/website/src/components/Header/NavLinks.tsx b/website/src/components/Header/NavLinks.tsx index 4f559e7e..47bd90be 100644 --- a/website/src/components/Header/NavLinks.tsx +++ b/website/src/components/Header/NavLinks.tsx @@ -1,7 +1,8 @@ -import { useColorMode } from "@chakra-ui/react"; +import { Text, useColorMode } from "@chakra-ui/react"; import { AnimatePresence, motion } from "framer-motion"; import Link from "next/link"; import { useState } from "react"; +import { colors } from "styles/Theme/colors"; export function NavLinks(): JSX.Element { const [hoveredIndex, setHoveredIndex] = useState(null); @@ -14,8 +15,8 @@ export function NavLinks(): JSX.Element { return ( <> {[ - ["Join Us", "/#join-us"], ["FAQ", "/#faq"], + ["Join Us", "/#join-us"], ].map(([label, href], index) => ( )} - {label} + + {label} + ))} diff --git a/website/src/components/Header/UserMenu.tsx b/website/src/components/Header/UserMenu.tsx index 35b71698..3e0e5dee 100644 --- a/website/src/components/Header/UserMenu.tsx +++ b/website/src/components/Header/UserMenu.tsx @@ -1,14 +1,15 @@ -import { Box, useColorModeValue } from "@chakra-ui/react"; +import { Box, Link, Text, useColorModeValue } from "@chakra-ui/react"; import { Popover } from "@headlessui/react"; import { AnimatePresence, motion } from "framer-motion"; import Image from "next/image"; import { signOut, useSession } from "next-auth/react"; import React from "react"; -import { FaCog, FaSignOutAlt } from "react-icons/fa"; +import { FiLayout, FiLogOut } from "react-icons/fi"; export function UserMenu() { const { data: session } = useSession(); - const backgroundColor = useColorModeValue("#FFFFFF", "#000000"); + const backgroundColor = useColorModeValue("white", "gray.700"); + const accentColor = useColorModeValue("gray.300", "gray.600"); if (!session) { return <>; @@ -16,10 +17,10 @@ export function UserMenu() { if (session && session.user) { const 
accountOptions = [ { - name: "Account Settings", - href: "/account", - desc: "Account Settings", - icon: FaCog, + name: "Dashboard", + href: "/dashboard", + desc: "Dashboard", + icon: FiLayout, //For future use }, ]; @@ -28,18 +29,22 @@ export function UserMenu() { {({ open }) => ( <> -
+ Profile Picture

{session.user.name || session.user.email}

-
+
{open && ( @@ -54,35 +59,45 @@ export function UserMenu() { y: -10, transition: { duration: 0.2 }, }} - className="absolute right-0 mt-3 w-screen bg-inherit max-w-xs p-4 rounded-md border border-slate-300/70" > - - {accountOptions.map((item) => ( - + + {accountOptions.map((item) => ( + +
+
+
+ {item.name} +
+ + ))} + signOut({ callbackUrl: "/" })} >
-
-

{item.name}

+ Sign Out
-
- ))} - signOut({ callbackUrl: "/" })} - > -
- -
-
-

Sign Out

-
-
+ +
diff --git a/website/src/components/Sortable/Sortable.tsx b/website/src/components/Sortable/Sortable.tsx index 269c39a4..270c67e7 100644 --- a/website/src/components/Sortable/Sortable.tsx +++ b/website/src/components/Sortable/Sortable.tsx @@ -17,8 +17,8 @@ import { verticalListSortingStrategy, } from "@dnd-kit/sortable"; import { ReactNode, useEffect, useState } from "react"; -import { CollapsableText } from "../CollapsableText"; +import { CollapsableText } from "../CollapsableText"; import { SortableItem } from "./SortableItem"; export interface SortableProps { diff --git a/website/src/components/Widgets/LeaderboardWidget.tsx b/website/src/components/Widgets/LeaderboardWidget.tsx new file mode 100644 index 00000000..5f680179 --- /dev/null +++ b/website/src/components/Widgets/LeaderboardWidget.tsx @@ -0,0 +1,94 @@ +import { Badge, Box, Image, Link, Stack, StackDivider, Text, useColorModeValue } from "@chakra-ui/react"; + +export function LeaderboardWidget() { + const backgroundColor = useColorModeValue("white", "gray.700"); + const accentColor = useColorModeValue("gray.200", "gray.900"); + + //need to add streak info to chart + + const leaderInfo = [ + { + name: "fozziethebeat#6690", + image: "/images/temp-avatars/av1.jpg", + score: "5,208", + arrowDir: "increase", + streak: false, + streakCount: "5-Day Streak", + }, + { + name: "k_nearest_neighbor#8579", + image: "/images/temp-avatars/av2.jpg", + score: "5,164", + arrowDir: "decrease", + streak: false, + streakCount: "", + }, + { + name: "andreaskoepf#2266", + image: "/images/temp-avatars/av3.jpg", + score: "5,120", + arrowDir: "", + streak: false, + streakCount: "2-Day Streak", + }, + { + name: "AbdBarho#1684", + image: "/images/temp-avatars/av4.jpg", + score: "4,260", + arrowDir: "", + streak: false, + streakCount: "", + }, + { + name: "zu#9016", + image: "/images/temp-avatars/av5.jpg", + score: "3,608", + arrowDir: "", + streak: false, + streakCount: "", + }, + ]; + + return ( +
+
+
+ Top 5 Contributors + + + View All -> + + +
+ + } spacing="4"> +
+

Name

+
+

Score

+
+
+ {leaderInfo.map((item) => ( +
+
+ Profile Picture +

{item.name}

+ {item.streakCount} +
+ +

{item.score}

+
+
+ ))} +
+
+
+
+ ); +} diff --git a/website/src/components/Widgets/SideMenu.tsx b/website/src/components/Widgets/SideMenu.tsx new file mode 100644 index 00000000..8b21b71f --- /dev/null +++ b/website/src/components/Widgets/SideMenu.tsx @@ -0,0 +1,83 @@ +import { Box, Button, Link, Text, Tooltip, useColorMode } from "@chakra-ui/react"; +import { useRouter } from "next/router"; +import { FiLayout, FiSun } from "react-icons/fi"; +import { colors } from "styles/Theme/colors"; + +export function SideMenu() { + const router = useRouter(); + const { colorMode, toggleColorMode } = useColorMode(); + const buttonOptions = [ + { + label: "Dashboard", + pathname: "/dashboard", + desc: "Dashboard Home", + icon: FiLayout, + }, + // { + // label: "Leaderboard", + // pathname: "#", + // desc: "Public Leaderboard", + // icon: FiAward, + // }, + // { + // label: "Stats", + // pathname: "#", + // desc: "User Statistics", + // icon: FiBarChart2, + // }, + ]; + + return ( +
+ + +
+ + + +
+
+
+ ); +} diff --git a/website/src/components/Widgets/TaskOption.tsx b/website/src/components/Widgets/TaskOption.tsx new file mode 100644 index 00000000..f807f391 --- /dev/null +++ b/website/src/components/Widgets/TaskOption.tsx @@ -0,0 +1,120 @@ +import { Box, Flex, GridItem, Heading, SimpleGrid, Text, useColorModeValue } from "@chakra-ui/react"; +import Link from "next/link"; + +const crTasks = [ + { + label: "Reply as User", + desc: "Chat with Open Assistant and help improve it’s responses as you interact with it.", + type: "create", + pathname: "/create/assistant_reply", + }, + { + label: "Reply as Assistant", + desc: "Help Open Assistant improve its responses to conversations with other users.", + type: "create", + pathname: "/create/assistant_reply", + }, +]; + +const evTasks = [ + { + label: "Rank User Replies", + type: "eval", + desc: "Help Open Assistant improve its responses to conversations with other users.", + pathname: "/evaluate/rank_user_replies", + }, + + { + label: "Rank Assistant Replies", + desc: "Score prompts given by Open Assistant based on their accuracy and readability.", + type: "eval", + pathname: "/evaluate/rank_assistant_replies", + }, + { + label: "Rank Initial Prompts", + desc: "Score prompts given by Open Assistant based on their accuracy and readability.", + type: "eval;", + pathname: "/evaluate/rank_initial_prompts", + }, +]; + +export const TaskOption = () => { + const backgroundColor = useColorModeValue("white", "gray.700"); + + return ( + +
+ Create + + {crTasks.map((item) => ( + + + + + + {item.label} + + + {item.desc} + + + + + + Go + + + + + ))} + +
+
+ Evaluate + + {evTasks.map((item) => ( + + + + + + {item.label} + + + {item.desc} + + + + + + Go + + + + + ))} + +
+
+ ); +}; diff --git a/website/src/components/Widgets/index.ts b/website/src/components/Widgets/index.ts new file mode 100644 index 00000000..48a40e74 --- /dev/null +++ b/website/src/components/Widgets/index.ts @@ -0,0 +1,3 @@ +export { LeaderboardWidget } from "./LeaderboardWidget"; +export { SideMenu } from "./SideMenu"; +export { TaskOption } from "./TaskOption"; diff --git a/website/src/middleware.ts b/website/src/middleware.ts index 8e48d3b7..b6a539b4 100644 --- a/website/src/middleware.ts +++ b/website/src/middleware.ts @@ -4,5 +4,5 @@ export { default } from "next-auth/middleware"; * Guards all pages under `/grading` and redirects them to the sign in page. */ export const config = { - matcher: ["/create/:path*", "/evaluate/:path*", "/account/:path*"], + matcher: ["/create/:path*", "/evaluate/:path*", "/account/:path*", "/dashboard"], }; diff --git a/website/src/pages/account/edit.tsx b/website/src/pages/account/edit.tsx index 322f4113..f695fce7 100644 --- a/website/src/pages/account/edit.tsx +++ b/website/src/pages/account/edit.tsx @@ -51,7 +51,6 @@ export default function Account() { -

{session.user.email}

); diff --git a/website/src/pages/auth/signin.tsx b/website/src/pages/auth/signin.tsx index 221eb1f0..59fc7c05 100644 --- a/website/src/pages/auth/signin.tsx +++ b/website/src/pages/auth/signin.tsx @@ -15,13 +15,13 @@ function Signin({ csrfToken, providers }) { const emailEl = useRef(null); const signinWithEmail = (ev: React.FormEvent) => { ev.preventDefault(); - signIn(email.id, { callbackUrl: "/", email: emailEl.current.value }); + signIn(email.id, { callbackUrl: "/dashboard", email: emailEl.current.value }); }; const debugUsernameEl = useRef(null); function signinWithDebugCredentials(ev: React.FormEvent) { ev.preventDefault(); - signIn(credentials.id, { callbackUrl: "/", username: debugUsernameEl.current.value }); + signIn(credentials.id, { callbackUrl: "/dashboard", username: debugUsernameEl.current.value }); } const { colorMode } = useColorMode(); @@ -98,7 +98,8 @@ function Signin({ csrfToken, providers }) { )} -
+
+
By signing up you agree to our

Terms of Service diff --git a/website/src/pages/dashboard.tsx b/website/src/pages/dashboard.tsx new file mode 100644 index 00000000..018227af --- /dev/null +++ b/website/src/pages/dashboard.tsx @@ -0,0 +1,37 @@ +import { Box, useColorMode } from "@chakra-ui/react"; +import Head from "next/head"; +import { Header } from "src/components/Header"; +import { LeaderboardWidget, SideMenu, TaskOption } from "src/components/Widgets"; +import { colors } from "styles/Theme/colors"; + +const Dashboard = () => { + const { colorMode } = useColorMode(); + return ( + <> + + Dashboard - Open Assistant + + + + + + + + + + + + + + + ); +}; + +Dashboard.getLayout = (page) => ( +
+
+ {page} +
+); + +export default Dashboard; diff --git a/website/src/pages/evaluate/rank_user_replies.tsx b/website/src/pages/evaluate/rank_user_replies.tsx index 250b17fc..3f806a8a 100644 --- a/website/src/pages/evaluate/rank_user_replies.tsx +++ b/website/src/pages/evaluate/rank_user_replies.tsx @@ -1,7 +1,9 @@ import { useColorMode } from "@chakra-ui/react"; import Head from "next/head"; import { useState } from "react"; +import { ContextMessages } from "src/components/ContextMessages"; import { LoadingScreen } from "src/components/Loading/LoadingScreen"; +import { Message } from "src/components/Messages"; import { Sortable } from "src/components/Sortable/Sortable"; import { SurveyCard } from "src/components/Survey/SurveyCard"; import { TaskControls } from "src/components/Survey/TaskControls"; @@ -9,8 +11,6 @@ import fetcher from "src/lib/fetcher"; import poster from "src/lib/poster"; import useSWRImmutable from "swr/immutable"; import useSWRMutation from "swr/mutation"; -import { Message } from "src/components/Messages"; -import { ContextMessages } from "src/components/ContextMessages"; const RankUserReplies = () => { const [tasks, setTasks] = useState([]); diff --git a/website/src/pages/index.tsx b/website/src/pages/index.tsx index 08528245..04f99829 100644 --- a/website/src/pages/index.tsx +++ b/website/src/pages/index.tsx @@ -1,16 +1,10 @@ import Head from "next/head"; -import { useSession } from "next-auth/react"; import { CallToAction } from "src/components/CallToAction"; import { Faq } from "src/components/Faq"; -import { Footer } from "src/components/Footer"; -import { Header } from "src/components/Header"; import { Hero } from "src/components/Hero"; import { getTransparentHeaderLayout } from "src/components/Layout"; -import { TaskSelection } from "src/components/TaskSelection"; const Home = () => { - const { data: session } = useSession(); - return ( <> @@ -20,15 +14,11 @@ const Home = () => { content="Conversational AI for everyone. 
An open source project to create a chat enabled GPT LLM run by LAION and contributors around the world." /> - {session ? ( - - ) : ( -
- - - -
- )} +
+ + + +
); }; diff --git a/website/styles/Theme/Components/Container.tsx b/website/styles/Theme/Components/Container.tsx new file mode 100644 index 00000000..ec7eeed9 --- /dev/null +++ b/website/styles/Theme/Components/Container.tsx @@ -0,0 +1,25 @@ +import { + color, + defineStyle, + defineStyleConfig, + // transition, +} from "@chakra-ui/styled-system"; +import { colors } from "../colors"; + +const baseStyle = defineStyle(({ colorMode }) => ({ + minWidth: "100%", + bg: colorMode === "light" ? colors.light.bg : colors.dark.bg, + // transition: "background-color 300ms cubic-bezier(0.4, 0, 1, 1)", + color: colorMode === "light" ? colors.light.text : colors.dark.text, +})); + +const variants = { + "no-padding": { + padding: 0, + }, +}; + +export const containerTheme = defineStyleConfig({ + baseStyle, + variants, +}); diff --git a/website/styles/Theme/colors.tsx b/website/styles/Theme/colors.tsx new file mode 100644 index 00000000..acadfa2b --- /dev/null +++ b/website/styles/Theme/colors.tsx @@ -0,0 +1,14 @@ +export const colors = { + light: { + bg: "gray.100", + btn: "gray.50", + div: "white", + text: "black", + }, + dark: { + bg: "gray.900", + btn: "gray.600", + div: "gray.700", + text: "gray.200", + }, +}; diff --git a/website/styles/Theme/index.tsx b/website/styles/Theme/index.tsx new file mode 100644 index 00000000..9e9b0d82 --- /dev/null +++ b/website/styles/Theme/index.tsx @@ -0,0 +1,68 @@ +import { + type ThemeConfig, + extendTheme, + usePrefersReducedMotion, +} from "@chakra-ui/react"; +import { containerTheme } from "./Components/Container"; +import { StyleFunctionProps, Styles } from "@chakra-ui/theme-tools"; + +const config: ThemeConfig = { + initialColorMode: "system", + useSystemColorMode: false, + disableTransitionOnChange: true, +}; + +const components = { + Container: containerTheme, + Box: (props: StyleFunctionProps) => ({ + backgroundColor: props.colorMode === "light" ? 
"white" : "gray.800", + }), + Button: { + baseStyle: { + fontWeight: "normal", + }, + sizes: { + lg: { + fontSize: "md", + paddingY: "7", + }, + }, + variants: { + solid: (props: StyleFunctionProps) => ({ + bg: props.colorMode === "light" ? "gray.100" : "gray.600", + _hover: { + bg: props.colorMode === "light" ? "gray.200" : "#3D4A60", + }, + _active: { + bg: props.colorMode === "light" ? "gray.300" : "#374254", + }, + borderRadius: "lg", + }), + // gradient: (props: StyleFunctionProps) => ({ + // bg: `linear-gradient(${white}, ${bgColor}) padding-box, + // linear-gradient(135deg, ${lgFrom}, ${lgTo}) border-box`, + // }), + }, + }, +}; + +const breakpoints = { + sm: "640px", + md: "768px", + lg: "1024px", + xl: "1280px", + "2xl": "1536px", +}; + +const styles = { + global: (props) => ({ + main: { + fontFamily: "Inter", + }, + header: { + fontFamily: "Inter", + }, + }), +}; + +export const theme = extendTheme({ config, styles, components, breakpoints }); From afe2afc169b5dc15880e1c45d17ffc2d44741881 Mon Sep 17 00:00:00 2001 From: rsandb Date: Tue, 3 Jan 2023 22:55:44 -0600 Subject: [PATCH 040/121] Dashboard --- website/src/components/Header/UserMenu.tsx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/website/src/components/Header/UserMenu.tsx b/website/src/components/Header/UserMenu.tsx index 3e0e5dee..66fe0405 100644 --- a/website/src/components/Header/UserMenu.tsx +++ b/website/src/components/Header/UserMenu.tsx @@ -4,7 +4,7 @@ import { AnimatePresence, motion } from "framer-motion"; import Image from "next/image"; import { signOut, useSession } from "next-auth/react"; import React from "react"; -import { FiLayout, FiLogOut } from "react-icons/fi"; +import { FiLayout, FiLogOut, FiSettings } from "react-icons/fi"; export function UserMenu() { const { data: session } = useSession(); @@ -23,6 +23,13 @@ export function UserMenu() { icon: FiLayout, //For future use }, + { + name: "Account Settings", + href: "/account", + desc: "Account 
Settings", + icon: FiSettings, + //For future use + }, ]; return ( From 38d5e25c60c6f0dd1bc3486fb6a112758a66ffd8 Mon Sep 17 00:00:00 2001 From: rsandb Date: Tue, 3 Jan 2023 23:11:16 -0600 Subject: [PATCH 041/121] fixed breakpoint issue on mobile. last commit --- website/src/styles/Theme/index.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/website/src/styles/Theme/index.ts b/website/src/styles/Theme/index.ts index 11edd8f3..0c43eddd 100644 --- a/website/src/styles/Theme/index.ts +++ b/website/src/styles/Theme/index.ts @@ -14,6 +14,14 @@ const components = { Container: containerTheme, }; +const breakpoints = { + sm: "640px", + md: "768px", + lg: "1024px", + xl: "1280px", + "2xl": "1536px", +}; + const styles: Styles = { global: (props) => ({ "*": { @@ -34,4 +42,4 @@ const styles: Styles = { }), }; -export const theme = extendTheme({ colors, config, styles, components }); +export const theme = extendTheme({ colors, config, styles, components, breakpoints }); From a293675a3c1e215465688b37fc09ff24c785727e Mon Sep 17 00:00:00 2001 From: rsandb Date: Wed, 4 Jan 2023 00:03:00 -0600 Subject: [PATCH 042/121] Delete .package-lock.json --- node_modules/.package-lock.json | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 node_modules/.package-lock.json diff --git a/node_modules/.package-lock.json b/node_modules/.package-lock.json deleted file mode 100644 index cfd645f0..00000000 --- a/node_modules/.package-lock.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "name": "Open-Assistant", - "lockfileVersion": 2, - "requires": true, - "packages": {} -} From 065e9273bc4866c04c3edd4015d6aefc03d739ae Mon Sep 17 00:00:00 2001 From: Alex Ott <66271487+AlexanderHOtt@users.noreply.github.com> Date: Tue, 3 Jan 2023 23:14:05 -0800 Subject: [PATCH 043/121] update a few things (#357) --- discord-bot/bot/extensions/work.py | 6 +++--- discord-bot/bot/messages.py | 21 +++++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git 
a/discord-bot/bot/extensions/work.py b/discord-bot/bot/extensions/work.py index 6b7f8ea4..0561039d 100644 --- a/discord-bot/bot/extensions/work.py +++ b/discord-bot/bot/extensions/work.py @@ -29,8 +29,8 @@ from oasst_shared.schemas.protocol import TaskRequestType plugin = lightbulb.Plugin("WorkPlugin") -MAX_TASK_TIME = 60 * 60 # 1 hour -MAX_TASK_ACCEPT_TIME = 60 # 1 minute +MAX_TASK_TIME = 60 * 60 # seconds +MAX_TASK_ACCEPT_TIME = 60 * 10 # seconds settings = Settings() @@ -117,7 +117,7 @@ async def _handle_task(ctx: lightbulb.Context, task_type: TaskRequestType) -> No # Task action loop completed = False while not completed: - await ctx.author.send(embed=plain_embed("Please type your response here")) + await ctx.author.send(embed=plain_embed("Please type your response below:")) try: event = await ctx.bot.wait_for( hikari.DMMessageCreateEvent, diff --git a/discord-bot/bot/messages.py b/discord-bot/bot/messages.py index 0f29511a..f74f8720 100644 --- a/discord-bot/bot/messages.py +++ b/discord-bot/bot/messages.py @@ -80,6 +80,7 @@ def initial_prompt_message(task: protocol_schema.InitialPromptTask) -> str: {_h1("INITIAL PROMPT")} + {_writing_prompt("Please provide an initial prompt to the assistant.")} {_hint(task.hint)} """ @@ -91,10 +92,10 @@ def rank_initial_prompts_message(task: protocol_schema.RankInitialPromptsTask) - {_h1("RANK INITIAL PROMPTS")} -{_ranking_prompt("Reply with the numbers of best to worst prompts separated by commas (example: '4,1,3,2')")} - {_ordered_list(task.prompts)} + +{_ranking_prompt("Reply with the numbers of best to worst prompts separated by commas (example: '4,1,3,2')")} """ @@ -104,12 +105,12 @@ def rank_prompter_reply_message(task: protocol_schema.RankPrompterRepliesTask) - {_h1("RANK PROMPTER REPLIES")} -{_ranking_prompt("Reply with the numbers of best to worst replies separated by commas (example: '4,1,3,2')")} - {_conversation(task.conversation)} {_user(None)} {_ordered_list(task.replies)} + +{_ranking_prompt("Reply with the 
numbers of best to worst replies separated by commas (example: '4,1,3,2')")} """ @@ -119,12 +120,12 @@ def rank_assistant_reply_message(task: protocol_schema.RankAssistantRepliesTask) {_h1("RANK ASSISTANT REPLIES")} -{_ranking_prompt("Reply with the numbers of best to worst replies separated by commas (example: '4,1,3,2')")} - {_conversation(task.conversation)} {_assistant(None)} {_ordered_list(task.replies)} + +{_ranking_prompt("Reply with the numbers of best to worst replies separated by commas (example: '4,1,3,2')")} """ @@ -134,11 +135,11 @@ def prompter_reply_message(task: protocol_schema.PrompterReplyTask) -> str: {_h1("PROMPTER REPLY")} -{_response_prompt("Please provide a reply to the assistant.")} - {_conversation(task.conversation)} {_hint(task.hint)} + +{_response_prompt("Please provide a reply to the assistant.")} """ @@ -147,10 +148,10 @@ def assistant_reply_message(task: protocol_schema.AssistantReplyTask) -> str: return f"""\ {_h1("ASSISTANT REPLY")} -{_response_prompt("Please provide a reply to the assistant.")} - {_conversation(task.conversation)} + +{_response_prompt("Please provide a reply to the assistant.")} """ From 556391bf491054abc47f01f3eb40de2bdd652b6d Mon Sep 17 00:00:00 2001 From: Keith Stevens Date: Wed, 4 Jan 2023 17:20:51 +0900 Subject: [PATCH 044/121] Adding the Redis secret for running the backend on AWS --- copilot/api/manifest.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/copilot/api/manifest.yml b/copilot/api/manifest.yml index b9262b51..b6ff6cf7 100644 --- a/copilot/api/manifest.yml +++ b/copilot/api/manifest.yml @@ -36,3 +36,4 @@ environments: secrets: # Note: URI, not URL. 
DATABASE_URI: /copilot/${COPILOT_APPLICATION_NAME}/${COPILOT_ENVIRONMENT_NAME}/secrets/API_DATABASE_URL + REDIS_HOST: /copilot/${COPILOT_APPLICATION_NAME}/${COPILOT_ENVIRONMENT_NAME}/secrets/REDIS_HOST From ad67f66aee7b21cdb068f2513cb9f9f6aa285bbf Mon Sep 17 00:00:00 2001 From: Keith Stevens Date: Wed, 4 Jan 2023 18:09:39 +0900 Subject: [PATCH 045/121] Downgrading sharp to ensure it builds on mac m1 --- website/package-lock.json | 14 +++++++------- website/package.json | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index 1803cbca..5cd3d9bb 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -39,7 +39,7 @@ "react": "18.2.0", "react-dom": "18.2.0", "react-icons": "^4.7.1", - "sharp": "^0.31.3", + "sharp": "0.31.2", "swr": "^2.0.0", "tailwindcss": "^3.2.4", "use-debounce": "^9.0.2" @@ -27907,9 +27907,9 @@ } }, "node_modules/sharp": { - "version": "0.31.3", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.31.3.tgz", - "integrity": "sha512-XcR4+FCLBFKw1bdB+GEhnUNXNXvnt0tDo4WsBsraKymuo/IAuPuCBVAL2wIkUw2r/dwFW5Q5+g66Kwl2dgDFVg==", + "version": "0.31.2", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.31.2.tgz", + "integrity": "sha512-DUdNVEXgS5A97cTagSLIIp8dUZ/lZtk78iNVZgHdHbx1qnQR7JAHY0BnXnwwH39Iw+VKhO08CTYhIg0p98vQ5Q==", "hasInstallScript": true, "dependencies": { "color": "^4.2.3", @@ -52121,9 +52121,9 @@ } }, "sharp": { - "version": "0.31.3", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.31.3.tgz", - "integrity": "sha512-XcR4+FCLBFKw1bdB+GEhnUNXNXvnt0tDo4WsBsraKymuo/IAuPuCBVAL2wIkUw2r/dwFW5Q5+g66Kwl2dgDFVg==", + "version": "0.31.2", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.31.2.tgz", + "integrity": "sha512-DUdNVEXgS5A97cTagSLIIp8dUZ/lZtk78iNVZgHdHbx1qnQR7JAHY0BnXnwwH39Iw+VKhO08CTYhIg0p98vQ5Q==", "requires": { "color": "^4.2.3", "detect-libc": "^2.0.1", diff --git a/website/package.json b/website/package.json index 
c1d0c3d2..c66e10ca 100644 --- a/website/package.json +++ b/website/package.json @@ -49,7 +49,7 @@ "react": "18.2.0", "react-dom": "18.2.0", "react-icons": "^4.7.1", - "sharp": "^0.31.3", + "sharp": "0.31.2", "swr": "^2.0.0", "tailwindcss": "^3.2.4", "use-debounce": "^9.0.2" From 733884943a49a33bdab0dad38dfe90625df8bdc3 Mon Sep 17 00:00:00 2001 From: Keith Stevens Date: Wed, 4 Jan 2023 18:14:35 +0900 Subject: [PATCH 046/121] Change the header logo link based on signed in status --- website/src/components/Header/Header.tsx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/website/src/components/Header/Header.tsx b/website/src/components/Header/Header.tsx index ed41936f..4ea453c6 100644 --- a/website/src/components/Header/Header.tsx +++ b/website/src/components/Header/Header.tsx @@ -21,6 +21,9 @@ function AccountButton() { } export function Header(props) { + const { data: session } = useSession(); + const homeURL = session ? "/dashboard" : "/"; + const { colorMode } = useColorMode(); const borderClass = props.transparent ? "" @@ -31,7 +34,7 @@ export function Header(props) {