Merge branch 'main' into 766_admin_enhancement

This commit is contained in:
notmd
2023-01-22 14:07:19 +07:00
33 changed files with 1186 additions and 38 deletions
+1
View File
@@ -54,6 +54,7 @@ class PublicSettings(pydantic.BaseModel):
PROJECT_NAME: str
API_V1_STR: str
MESSAGE_SIZE_LIMIT: int
DEBUG_USE_SEED_DATA: bool
DEBUG_ALLOW_SELF_LABELING: bool
DEBUG_SKIP_EMBEDDING_COMPUTATION: bool
+1 -1
View File
@@ -111,7 +111,7 @@ def get_users_cursor(
n = lt
return p, n
def remove_extra_item(items: list[protocol.FrontEndUser], lt: str | None, gt: str):
def remove_extra_item(items: list[protocol.FrontEndUser], lt: str | None, gt: str | None):
num_rows = len(items)
if qry_max_count > max_count and num_rows == qry_max_count:
assert not (lt and gt)
+1
View File
@@ -72,6 +72,7 @@ class Settings(BaseSettings):
DATABASE_MAX_TX_RETRY_COUNT: int = 3
RATE_LIMIT: bool = True
MESSAGE_SIZE_LIMIT: int = 2000
REDIS_HOST: str = "localhost"
REDIS_PORT: str = "6379"
+7
View File
@@ -465,6 +465,13 @@ class TreeManager:
f"Frontend reports text reply to {interaction.message_id=} with {interaction.text=} by {interaction.user=}."
)
# ensure message size is below the predefined limit
if len(interaction.text) > settings.MESSAGE_SIZE_LIMIT:
logger.error(
f"Message size {len(interaction.text)=} exceeds size limit of {settings.MESSAGE_SIZE_LIMIT=}."
)
raise OasstError("Message size too long.", OasstErrorCode.TASK_MESSAGE_TOO_LONG)
# here we store the text reply in the database
message = pr.store_text_reply(
text=interaction.text,
+18 -6
View File
@@ -153,7 +153,7 @@ class UserRepository:
if api_client_id != self.api_client.id:
raise OasstError("Forbidden", OasstErrorCode.API_CLIENT_NOT_AUTHORIZED, HTTP_403_FORBIDDEN)
qry = self.db.query(User).order_by(User.username, User.id)
qry = self.db.query(User)
if gte_username is not None:
if gt_id:
@@ -184,8 +184,14 @@ class UserRepository:
pattern = "%{}%".format(search_text.replace("\\", "\\\\").replace("_", "\\_").replace("%", "\\%"))
qry = qry.filter(User.username.like(pattern))
if limit is not None:
qry = qry.limit(limit)
if limit is not None and lte_username and not gte_username:
# select top rows but return results in ascernding order
sub_qry = qry.order_by(User.username.desc(), User.id.desc()).limit(limit).subquery("u")
qry = self.db.query(User).select_entity_from(sub_qry).order_by(User.username, User.id)
else:
qry = qry.order_by(User.username, User.id)
if limit is not None:
qry = qry.limit(limit)
return qry.all()
@@ -210,7 +216,7 @@ class UserRepository:
# Unprivileged api client asks for foreign users
raise OasstError("Forbidden", OasstErrorCode.API_CLIENT_NOT_AUTHORIZED, HTTP_403_FORBIDDEN)
qry = self.db.query(User).order_by(User.display_name, User.id)
qry = self.db.query(User)
if gte_display_name is not None:
if gt_id:
@@ -254,8 +260,14 @@ class UserRepository:
if auth_method:
qry = qry.filter(User.auth_method == auth_method)
if limit is not None:
qry = qry.limit(limit)
if limit is not None and lte_display_name and not gte_display_name:
# select top rows but return results in ascernding order
sub_qry = qry.order_by(User.display_name.desc(), User.id.desc()).limit(limit).subquery("u")
qry = self.db.query(User).select_entity_from(sub_qry).order_by(User.display_name, User.id)
else:
qry = qry.order_by(User.display_name, User.id)
if limit is not None:
qry = qry.limit(limit)
users = qry.all()
+35
View File
@@ -0,0 +1,35 @@
# OpenAssitant Inference
Preliminary implementation of the inference engine for OpenAssistant.
## Development (you'll need multiple terminals)
Run a redis container (or use the one of the general docker compose file):
```bash
docker run --rm -it -p 6379:6379 redis
```
Run the inference server:
```bash
cd server
pip install -r requirements.txt
uvicorn main:app --reload
```
Run one (or more) workers:
```bash
cd worker
pip install -r requirements.txt
python __main__.py
```
Run the client:
```bash
cd text-client
pip install -r requirements.txt
python __main__.py
```
+10
View File
@@ -0,0 +1,10 @@
# OpenAssistant Inference Server
Workers communicate with the `/work` endpoint via Websocket. They provide their
configuration and if a task is available, the server returns it. The worker then
performs the task and returns the result in a streaming fashion to the server,
also via websocket.
Clients first call `/chat` to make a new chat, then add to that via
`/chat/<id>/message`. The response is a SSE event source, which will send tokens
as they are available.
+193
View File
@@ -0,0 +1,193 @@
import asyncio
import enum
import uuid
import fastapi
import pydantic
import redis.asyncio as redis
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from oasst_shared.schemas import inference, protocol
from sse_starlette.sse import EventSourceResponse
app = fastapi.FastAPI()
# Allow CORS
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class Settings(pydantic.BaseSettings):
redis_host: str = "localhost"
redis_port: int = 6379
redis_db: int = 0
sse_retry_timeout: int = 15000
settings = Settings()
# create async redis client
redisClient = redis.Redis(
host=settings.redis_host, port=settings.redis_port, db=settings.redis_db, decode_responses=True
)
class CreateChatRequest(pydantic.BaseModel):
pass
class CreateChatResponse(pydantic.BaseModel):
id: str
class MessageRequest(pydantic.BaseModel):
message: str = pydantic.Field(..., repr=False)
model_name: str = "distilgpt2"
max_new_tokens: int = 100
def compatible_with(self, worker_config: inference.WorkerConfig) -> bool:
return self.model_name == worker_config.model_name
class TokenResponseEvent(pydantic.BaseModel):
token: str
class MessageRequestState(str, enum.Enum):
pending = "pending"
in_progress = "in_progress"
complete = "complete"
class DbChatEntry(pydantic.BaseModel):
id: str = pydantic.Field(default_factory=lambda: str(uuid.uuid4()))
conversation: protocol.Conversation = pydantic.Field(default_factory=protocol.Conversation)
pending_message_request: MessageRequest | None = None
message_request_state: MessageRequestState | None = None
# TODO: make real database
CHATS: dict[str, DbChatEntry] = {}
@app.post("/chat")
async def create_chat(request: CreateChatRequest) -> CreateChatResponse:
"""Allows a client to create a new chat."""
logger.info(f"Received {request}")
chat = DbChatEntry()
CHATS[chat.id] = chat
return CreateChatResponse(id=chat.id)
@app.get("/chat/{id}")
async def get_chat(id: str) -> protocol.Conversation:
"""Allows a client to get the current state of a chat."""
return CHATS[id].conversation
@app.post("/chat/{id}/message")
async def create_message(id: str, message_request: MessageRequest, fastapi_request: fastapi.Request):
"""Allows the client to stream the results of a request."""
chat = CHATS[id]
if not chat.conversation.is_prompter_turn:
raise fastapi.HTTPException(status_code=400, detail="Not your turn")
if chat.pending_message_request is not None:
raise fastapi.HTTPException(status_code=400, detail="Already pending")
chat.conversation.messages.append(
protocol.ConversationMessage(
text=message_request.message,
is_assistant=False,
)
)
chat.pending_message_request = message_request
chat.message_request_state = MessageRequestState.pending
async def event_generator():
result_data = []
try:
while True:
if await fastapi_request.is_disconnected():
logger.warning("Client disconnected")
break
item = await redisClient.blpop(chat.id, 1)
if item is None:
continue
_, response_packet_str = item
response_packet = inference.WorkResponsePacket.parse_raw(response_packet_str)
result_data.append(response_packet)
if response_packet.is_end:
break
yield {
"retry": settings.sse_retry_timeout,
"data": TokenResponseEvent(token=response_packet.token).json(),
}
logger.info(f"Finished streaming {chat.id} {len(result_data)=}")
except Exception:
logger.exception(f"Error streaming {chat.id}")
chat.conversation.messages.append(
protocol.ConversationMessage(
text="".join([d.token for d in result_data[:-1]]),
is_assistant=True,
)
)
chat.pending_message_request = None
return EventSourceResponse(event_generator())
@app.websocket("/work")
async def work(websocket: fastapi.WebSocket):
await websocket.accept()
worker_config = inference.WorkerConfig.parse_raw(await websocket.receive_text())
while True:
# find a pending task that matches the worker's config
# could also be implemented using task queues
# but general compatibility matching is tricky
for chat in CHATS.values():
if (request := chat.pending_message_request) is not None:
if chat.message_request_state == MessageRequestState.pending:
if request.compatible_with(worker_config):
break
else:
logger.debug("No pending tasks")
await asyncio.sleep(1)
continue
chat.message_request_state = MessageRequestState.in_progress
work_request = inference.WorkRequest(
conversation=chat.conversation,
model_name=request.model_name,
max_new_tokens=request.max_new_tokens,
)
logger.info(f"Created {work_request}")
try:
await websocket.send_text(work_request.json())
while True:
# maybe unnecessary to parse and re-serialize
# could just pass the raw string and mark end via empty string
response_packet = inference.WorkResponsePacket.parse_raw(await websocket.receive_text())
await redisClient.rpush(chat.id, response_packet.json())
if response_packet.is_end:
break
except fastapi.WebSocketException:
# TODO: handle this better
logger.exception(f"Websocket closed during handling of {chat.id}")
chat.message_request_state = MessageRequestState.complete
+6
View File
@@ -0,0 +1,6 @@
fastapi[all]
loguru
pydantic
redis
sse-starlette
websockets
+40
View File
@@ -0,0 +1,40 @@
"""Simple REPL frontend."""
import json
import requests
import sseclient
import typer
app = typer.Typer()
@app.command()
def main(backend_url: str = "http://127.0.0.1:8000"):
"""Simple REPL client."""
chat_id = requests.post(f"{backend_url}/chat", json={}).json()["id"]
while True:
message = typer.prompt("User").strip()
# wait for stream to be ready
# could implement a queue position indicator
# could be implemented with long polling
# but server load needs to be considered
response = requests.post(
f"{backend_url}/chat/{chat_id}/message",
json={"message": message},
stream=True,
headers={"Accept": "text/event-stream"},
)
response.raise_for_status()
client = sseclient.SSEClient(response)
print("Assistant: ", end="", flush=True)
for event in client.events():
data = json.loads(event.data)
print(data["token"], end="", flush=True)
print()
if __name__ == "__main__":
app()
+3
View File
@@ -0,0 +1,3 @@
requests
sseclient-py
typer
+79
View File
@@ -0,0 +1,79 @@
import re
import time
import rel
import torch
import typer
import websocket
from loguru import logger
from oasst_shared.schemas import inference, protocol
from transformers import pipeline
app = typer.Typer()
@app.command()
def main(
backend_url: str = "ws://localhost:8000",
model_name: str = "distilgpt2",
):
pipe = pipeline("text-generation", model=model_name)
def on_open(ws: websocket.WebSocket):
worker_config = inference.WorkerConfig(model_name=model_name)
ws.send(worker_config.json())
def on_message(ws: websocket.WebSocket, message: str):
# TODO: what if this comes in, but one is already in progress?
# also need to think of enabling batching
work_request = inference.WorkRequest.parse_raw(message)
def _prepare_message(message: protocol.ConversationMessage) -> str:
prefix = "Assistant: " if message.is_assistant else "User: "
return prefix + message.text
# construct prompt
messages = [_prepare_message(message) for message in work_request.conversation.messages]
prompt = "\n".join(messages) + "\nAssistant:"
# TODO: replace this with incremental generation
torch.manual_seed(work_request.seed)
model_output = pipe(prompt, max_new_tokens=work_request.max_new_tokens, do_sample=True, return_full_text=False)[
0
]["generated_text"]
model_output = model_output.strip()
# fake streaming
split_idcs = [m.start() for m in re.finditer(r"([\w:]+)", model_output)]
pieces = [model_output[a:b] for a, b in zip([0] + split_idcs, split_idcs + [None])]
for piece in pieces:
if not piece:
continue
if piece.strip() in ("User:", "Assistant:"):
break
ws.send(inference.WorkResponsePacket(token=piece).json())
time.sleep(0.1)
ws.send(inference.WorkResponsePacket(is_end=True).json())
def on_error(ws: websocket.WebSocket, error: Exception):
logger.error(f"Connection error: {error}")
def on_close(ws: websocket.WebSocket, close_status_code: int, close_msg: str):
logger.warning(f"Connection closed: {close_status_code=} {close_msg=}")
ws = websocket.WebSocketApp(
f"{backend_url}/work",
on_message=on_message,
on_error=on_error,
on_close=on_close,
on_open=on_open,
)
ws.run_forever(dispatcher=rel, reconnect=5)
rel.signal(2, rel.abort)
rel.dispatch()
if __name__ == "__main__":
app()
+6
View File
@@ -0,0 +1,6 @@
loguru
rel
torch
transformers
typer
websocket-client
@@ -37,6 +37,7 @@ class OasstErrorCode(IntEnum):
TASK_GENERATION_FAILED = 1005
TASK_REQUESTED_TYPE_NOT_AVAILABLE = 1006
TASK_AVAILABILITY_QUERY_FAILED = 1007
TASK_MESSAGE_TOO_LONG = 1008
# 2000-3000: prompt_repository
INVALID_FRONTEND_MESSAGE_ID = 2000
@@ -0,0 +1,21 @@
import random
import pydantic
from . import protocol
class WorkerConfig(pydantic.BaseModel):
model_name: str = "distilgpt2"
class WorkRequest(pydantic.BaseModel):
conversation: protocol.Conversation = pydantic.Field(..., repr=False)
model_name: str = "distilgpt2"
max_new_tokens: int = 100
seed: int = pydantic.Field(default_factory=lambda: random.randint(0, 2**32 - 1))
class WorkResponsePacket(pydantic.BaseModel):
token: str | None = None
is_end: bool = False
@@ -64,6 +64,18 @@ class Conversation(BaseModel):
messages: list[ConversationMessage] = []
def __len__(self):
return len(self.messages)
@property
def is_prompter_turn(self) -> bool:
if len(self) == 0:
return True
last_message = self.messages[-1]
if last_message.is_assistant:
return True
return False
class Message(ConversationMessage):
parent_id: Optional[UUID] = None
View File
@@ -0,0 +1,108 @@
---
annotations_creators:
- no-annotation
language:
- en
language_creators:
- machine-generated
license:
- mit
multilinguality:
- monolingual
pretty_name: "SODA Synthetic Dialogue"
size_categories:
- 1M<n<10M
source_datasets:
- extended|allenai/soda
tags:
- open-assistant
- conversations-summarization
task_categories:
- conversational
task_ids:
- dialogue-generation
dataset_info:
features:
- name: conversation
dtype: string
config_name: soda_synthetic_dialogue_dialogue_modeling
splits:
- name: train
num_bytes: 1524941476
num_examples: 1191582
- name: test
num_bytes: 190565707
num_examples: 148968
- name: validation
num_bytes: 187161878
num_examples: 146346
download_size: 1952926579
dataset_size: 1902669061
---
# Dataset Card for 🥤SODA Synthetic Dialogue
## Dataset Description
- **Repository:**
[Code](https://github.com/LAION-AI/Open-Assistant/openassistant/datasets/soda_synthetic_dialogue/)
## Dataset Summary
🥤SODA Synthetic Dialogue is a set of synthetic dialogues between Assistant and
User. In each conversation, User asks Assistant to perform summarization or
story generation tasks based on a snippet of an existing dialogue, story, or
from a title or theme.
This data was created by synthesizing the dialogues in
[🥤Soda](https://huggingface.co/datasets/allenai/soda) and applying a set of
templates to generate the conversation. The original research paper can be found
[here](https://arxiv.org/pdf/2212.10465v1.pdf).
Example:
```
User: Can you write the next few lines of dialogue for this scene:
Cornell: I know what you're thinking, Dontrell. I can sense your feelings and know exactly what you're going to say or do next. You're thinking about how much you miss your family, and how you wish you could see them again. You're wondering if there's anything you can do to make that happen. And you're also feeling a little guilty, because you know that if you could see them again, it would only be for a short while before they died.
Dontrell: Cornell, how can you possibly know all that? It's like you can read my mind!
Assistant: Sure, the next dialogue for this scene could be:
Cornell: I told you, I can sense your feelings. It's like I have a connection to your thoughts and emotions. I'm not sure why or how it works, but ever since we met, I've been able to understand what you're thinking and feeling pretty easily.
Dontrell: Wow...that's amazing! And a little bit scary too. But it does explain why we've always been so close - even though we've only known each other for a few months.
Cornell: Yeah, I guess our connection is pretty strong. But don't worry, Dontrell - I'll never use it to manipulate or control you in any way. I promise."
Dontrell: That's a relief! I was starting to feel a little bit like you could read my mind, and it was kind of creepy. But if you promise not to use your abilities to control me, then I'm okay with it.
Cornell: I swear, Dontrell. I'll never use my abilities to control or manipulate you in any way. You have my word.
User: And a title?
Assistant: Sure, a title for this dialogue could be:
Cornell is intuitive. Cornell reads Dontrell's mind.
User: How about a short description?
Assistant: Sure, a short description for this dialogue could be:
Cornell knows what Dontrell is thinking. He can sense her feelings and know exactly what she is going to say or do next.
User: What would be one theme of this story?
Assistant: One theme of this story could be:
being intuitive
```
## Usage
The dataset contains one configuration, `dialogue_modeling`, which has a single
text `conversation` feature.
## Source data
The script to synthesize this dataset was originally created by
[ontocord](https://github.com/ontocord) in
[this Colab notebook](https://colab.research.google.com/drive/1Sw3px5dP8whdqT7QMNoqwmqIasZkMbJi?usp=sharing)
and prepared for Hugging Face by
[Jeffrey Quesnelle](https://github.com/jquesnelle/).
## Citation
Please cite our work if you find the resources in this repository useful:
```
@article{ontocord2023sodasynth,
author = {ontocord and Jeffrey Quesnelle},
title = {SODA Synthetic Dialogue},
year = {2023}
}
```
@@ -0,0 +1,21 @@
from dataclasses import dataclass
import datasets
@dataclass
class OpenAssistantConfig(datasets.BuilderConfig):
"""BuilderConfig for OpenAssistant datasets."""
name: str = None
version: datasets.Version = None
description: str = None
schema: str = None
subset_id: str = None
features = datasets.Features(
{
"conversation": datasets.Value("string"),
}
)
@@ -0,0 +1,246 @@
"""Prepare the SODA Synthetic Dialogue Dataset"""
import json
import os
import random
import sys
from datasets import load_dataset
from tqdm import tqdm
# adapted from https://colab.research.google.com/drive/1Sw3px5dP8whdqT7QMNoqwmqIasZkMbJi?usp=sharing
SUMMARY_TEMPLATE = """User: Can you give me a short story description for this dialogue?
{dialogue}
Assistant: Sure, a short story description for this dialogue could be:
{story}
User: And a title?
Assistant: Sure, a title for this dialogue could be:
{title}"""
THEME_TEMPLATE = """
User: What would be one theme of this story?
Assistant: One theme of this story could be:
{theme}"""
NEW_DIALOGUE_TEMPLATE = """User: Can you write a short dialogue based on this story:
{story}
Assistant: Sure, a dialogue for this story could be:
{dialogue}
User: And a title?
Assistant: Sure, a title for this dialogue could be:
{title}"""
NEXT_LINES_TEMPLATE = """User: Can you write the next few lines of dialogue for this scene:
{scene}
Assistant: Sure, the next dialogue for this scene could be:
{dialogue}
User: And a title?
Assistant: Sure, a title for this dialogue could be:
{title}
User: How about a short description?
Assistant: Sure, a short description for this dialogue could be:
{story}"""
NEW_STORY_AND_DIALOGUE_TEMPLATE = """User: Can you write a short story and dialogue about:
{title1}
Assistant: Sure, a short story and dialogue about: "{title1}" could be:
{story}"""
FULL_DIALOGUE_TEMPLATE = """{conversation}
{dialogue}"""
MORE_DIALOGUE_TEMPLATE = """{conversation}
{dialogue1}
User: Can you provide more dialogue assuming "{title2}"?
Assistant: Sure, the next dialogue for this scene could be:
{dialogue2}"""
NEXT_DIALOGUE_TEMPLATE = """{conversation}
{dialogue1}
User: More please.
Assistant: Sure, the next dialogue for this scene could be:
{dialogue2}"""
NEW_STORY_AND_DIALOGUE_FROM_THEME_TEMPLATE = """User: Can you write short story and dialogue based on the theme:
{theme}
Assistant: Sure, a short story and dialogue based on the theme "{theme}" could be:
{story}
{dialogue}
User: And a title?
Assistant: Sure, a title for this dialogue could be:
{title}"""
PRINT = len(sys.argv) > 1 and sys.argv[1] == "--print"
def main(output_dir: str = "data"):
"""Download and prepare the dataset for use."""
random.seed(42)
dataset = load_dataset("allenai/soda")
os.makedirs(output_dir, exist_ok=True)
for split in ["train", "test", "validation"]:
with open(f"{output_dir}/{split}.jsonl", "w", encoding="utf8") as output:
for i in tqdm(range(len(dataset[split])), desc=split):
dat = dataset["train"][i]
title = dat["literal"]
story = dat["narrative"]
if dat["relation"] == "xWant":
theme = "wanting " + dat["tail"]
elif dat["relation"] == "xNeed":
theme = "needing " + dat["tail"]
elif not dat["tail"].startswith("to ") and not dat["tail"].startswith("and "):
theme = "being " + dat["tail"]
elif dat["tail"].startswith("and "):
theme = "people are " + dat["tail"].replace("and PersonY ", "")
else:
theme = dat["tail"]
theme = theme.replace("PersonY", "another person")
theme = theme.replace("being is", "being")
dialogue = [s2 + ": " + s1 for s1, s2 in zip(dat["dialogue"], dat["speakers"])]
if random.randint(0, 6) == 0:
# print("##")
# print(f"User: Can you give me a short story description for this dialog?")
# print(" " + "\n ".join(dialog))
# print(f"Assistant: Sure, a short story description for this dialog could be: \n {story}")
# print("User: And a title?")
# print(f"Assistant: Sure, a title for this dialog could be: \n {title}")
# if theme:
# print("User: What would be one theme of this story?")
# print(f'Assistant: One theme of this story could be: "{theme}"')
conversation = SUMMARY_TEMPLATE.format(dialogue="\n ".join(dialogue), story=story, title=title)
if theme:
conversation = conversation + THEME_TEMPLATE.format(theme=theme)
elif random.randint(0, 6) == 0:
# print("##")
# print(f"User: Can you write a short dialog based on this story:\n {story}")
# print(f"Assistant: Sure, a dialog for this story could be:")
# print(" " + "\n ".join(dialog))
# print("User: And a title?")
# print(f"Assistant: Sure, a title for this dialog could be: \n {title}")
# if theme:
# print("User: What would be one theme of this story?")
# print(f'Assistant: One theme of this story could be: "{theme}"')
conversation = NEW_DIALOGUE_TEMPLATE.format(
story=story, dialogue="\n ".join(dialogue), title=title
)
if theme:
conversation = conversation + THEME_TEMPLATE.format(theme=theme)
elif random.randint(0, 3) == 0:
# print("##")
# print(f"User: Can you write the next few lines of dialog for this scene:")
# if random.randint(0, 1) == 0:
# print(" " + "\n ".join(dialog[:-5]))
# print(f"Assistant: Sure, the next dialog for this scene could be:")
# print(" " + "\n ".join(dialog[-5:]))
# elif random.randint(0, 1) == 0:
# print(" " + "\n ".join(dialog[:-3]))
# print(f"Assistant: Sure, the next dialog for this scene could be:")
# print(" " + "\n ".join(dialog[-3:]))
# else:
# print(" " + "\n ".join(dialog[:-4]))
# print(f"Assistant: Sure, the next dialog for this scene could be:")
# print(" " + "\n ".join(dialog[-4:]))
# print("User: And a title?")
# print(f"Assistant: Sure, a title for this dialog could be: \n {title}")
# print("User: How about a short description?")
# print(f"Assistant: Sure, a short description for this dialog could be: \n {story}")
# if theme:
# print("User: What would be one theme of this story?")
# print(f'Assistant: One theme of this story could be: "{theme}"')
if random.randint(0, 1) == 0:
depth = -5
elif random.randint(0, 1) == 0:
depth = -3
else:
depth = -4
conversation = NEXT_LINES_TEMPLATE.format(
scene="\n ".join(dialogue[:depth]),
dialogue="\n ".join(dialogue[depth:]),
title=title,
story=story,
)
if theme:
conversation = conversation + THEME_TEMPLATE.format(theme=theme)
elif random.randint(0, 3) == 0:
# print("##")
# title1 = title.split(".")[0]
# title2 = title.split(".")[1]
# print(f"User: Can you write short story and dialog about: {title1}")
# print(f'Assistant: Sure, a short story and dialog about: "{title1}" could be:')
# print(f" {story}")
# if random.randint(0, 1) == 0:
# print(" " + "\n ".join(dialog))
# elif random.randint(0, 1) == 0 and len(dialog) > 5:
# print(" " + "\n ".join(dialog[:-5]))
# print(f'User: Can you provide more dialog assuming "{title2}"?')
# print(f"Assistant: Sure, the next dialog for this scene could be:")
# print(" " + "\n ".join(dialog[-5:]))
# elif random.randint(0, 1) == 0:
# print(" " + "\n ".join(dialog[:-3]))
# print("User: more please.")
# print(f"Assistant: Sure, the next dialog for this scene could be:")
# print(" " + "\n ".join(dialog[-3:]))
# else:
# print(" " + "\n ".join(dialog[:-4]))
# print(f'User: Can you provide more dialog assuming "{title2}"?')
# print(f"Assistant: Sure, the next dialog for this scene could be:")
# print(" " + "\n ".join(dialog[-4:]))
# if theme:
# print("User: What would be one theme of this story?")
# print(f'Assistant: One theme of this story could be: "{theme}"')
title1 = title.split(".")[0]
title2 = title.split(".")[1]
conversation = NEW_STORY_AND_DIALOGUE_TEMPLATE.format(title1=title1, story=story)
if random.randint(0, 1) == 0:
conversation = FULL_DIALOGUE_TEMPLATE.format(
conversation=conversation, dialogue="\n ".join(dialogue)
)
elif random.randint(0, 1) == 0 and len(dialogue) > 5:
conversation = MORE_DIALOGUE_TEMPLATE.format(
conversation=conversation,
dialogue1="\n ".join(dialogue[:-5]),
title2=title2,
dialogue2="\n ".join(dialogue[-5:]),
)
elif random.randint(0, 1) == 0:
conversation = NEXT_DIALOGUE_TEMPLATE.format(
conversation=conversation,
dialogue1="\n ".join(dialogue[:-3]),
dialogue2="\n ".join(dialogue[-3:]),
)
else:
conversation = MORE_DIALOGUE_TEMPLATE.format(
conversation=conversation,
dialogue1="\n ".join(dialogue[:-4]),
title2=title2,
dialogue2="\n ".join(dialogue[-4:]),
)
if theme:
conversation = conversation + THEME_TEMPLATE.format(theme=theme)
else:
# print("##")
# print(f"User: Can you write short story and dialog based on the theme:\n {theme}")
# print(f'Assistant: Sure, a short story and dialog based on the theme "{theme}" could be:')
# print(f" {story}")
# print(" " + "\n ".join(dialog))
# print("User: And a title?")
# print(f"Assistant: Sure, a title for this dialog could be: \n {title}")
conversation = NEW_STORY_AND_DIALOGUE_FROM_THEME_TEMPLATE.format(
theme=theme, story=story, dialogue="\n ".join(dialogue), title=title
)
if PRINT:
print("##")
print(conversation)
output.write(f"{json.dumps({'conversation': conversation})}\n")
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,108 @@
# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This dataset is a set of dialogues synthesized from the SODA dataset.
In each dialogue, User and Assistant have a conversation about a story.
The original collab notebook for this dataset can be found at:
https://colab.research.google.com/drive/1Sw3px5dP8whdqT7QMNoqwmqIasZkMbJi?usp=sharing
"""
import json
from typing import Dict, List, Tuple
import datasets
from .hub import OpenAssistantConfig, features
_CITATION = """\
@article{ontocord2023sodasynth,
author = {ontocord and Jeffrey Quesnelle},
title = {SODA Synthetic Dialogue},
year = {2023}
}
"""
_DATASETNAME = "soda_synthetic_dialogue"
_DISPLAYNAME = "🥤SODA Synthetic Dialogue"
_DESCRIPTION = "A set of dialogues synthesized from the SODA dataset."
_HOMEPAGE = ""
_LICENSE = "mit"
_URLS = {
_DATASETNAME: {"train": "./data/train.jsonl", "test": "./data/test.jsonl", "validation": "./data/validation.jsonl"}
}
_SUPPORTED_TASKS = ["dialogue-modeling"]
_VERSION = "1.0.0"
class SODASyntheticDialogueDataset(datasets.GeneratorBasedBuilder):
"""A set of dialogues synthesized from the SODA dataset."""
VERSION = datasets.Version(_VERSION)
BUILDER_CONFIGS = [
OpenAssistantConfig(
name=f"{_DATASETNAME}_dialogue_modeling",
version=VERSION,
description=f"OpenAssistant dataset config for {_DATASETNAME}",
schema="dialogue_modeling",
subset_id=_DATASETNAME,
)
]
DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_dialogue_modeling"
def _info(self) -> datasets.DatasetInfo:
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""
urls = _URLS[_DATASETNAME]
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": data_dir, "split": "train"},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={"filepath": data_dir, "split": "test"},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filepath": data_dir, "split": "validation"},
),
]
def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
if self.config.schema == "dialogue_modeling":
key = 0
with open(filepath[split], "r", encoding="utf8") as data:
while True:
line = data.readline()
if not line:
return
yield key, json.loads(line)
key += 1
+1 -2
View File
@@ -13,6 +13,5 @@
"sign_in": "Sign In",
"sign_out": "Sign Out",
"terms_of_service": "Terms of Service",
"title": "Open Assistant",
"last_updated_at": "Last updated at: {{val, datetime}}"
"title": "Open Assistant"
}
+6 -7
View File
@@ -1,16 +1,15 @@
{
"title": "Open Assistant",
"subtitle": "Conversational AI for everyone.",
"description": "Conversational AI for everyone. An open source project to create a chat enabled GPT LLM run by LAION and contributors around the world.",
"blurb": "We believe we can create a revolution.",
"blurb1": "In the same way that Stable Diffusion helped the world make art and images in new ways, we want to improve the world by providing amazing conversational AI.",
"join_us_title": "Join us",
"join_us_description": "All open source projects begin with people like you. Open source is the belief that if we collaborate we can together gift our knowledge and technology to the world for the benefit of humanity. Are you in? Find us here:",
"faq_title": "Frequently Asked Questions",
"description": "Conversational AI for everyone. An open source project to create a chat enabled GPT LLM run by LAION and contributors around the world.",
"faq_items": {
"q0": "How far along is this project?",
"a0": "We are in the early stages of development, working from established research in applying RLHF to large language models.",
"q1": "Who is behind Open Assistant?",
"a1": "Open Assistant is a project organized by LAION and individuals around the world interested in bringing this technology to everyone."
}
},
"faq_title": "Frequently Asked Questions",
"join_us_description": "All open source projects begin with people like you. Open source is the belief that if we collaborate we can together gift our knowledge and technology to the world for the benefit of humanity. Are you in? Find us here:",
"join_us_title": "Join us",
"subtitle": "Conversational AI for everyone."
}
@@ -0,0 +1,11 @@
{
"daily": "Daily",
"last_updated_at": "Last updated at: {{val, datetime}}",
"leaderboard": "Leaderboard",
"monthly": "Monthly",
"overall": "Overall",
"rank": "Rank",
"score": "Score",
"user": "User",
"weekly": "Weekly"
}
+2 -2
View File
@@ -6,7 +6,7 @@ import { AnimatedCircles } from "./AnimatedCircles";
import { Container } from "./Container";
export function Hero() {
const { t } = useTranslation("index");
const { t } = useTranslation(["index", "common"]);
const { colorMode } = useColorMode();
const pTextColor = colorMode === "light" ? "text-gray-600" : "text-white";
const fancyTextGradientClasses =
@@ -17,7 +17,7 @@ export function Hero() {
<Box className="lg:grid lg:grid-cols-12 lg:gap-x-8 lg:gap-y-20">
<Box className="relative mx-auto max-w-2xl lg:col-span-7 lg:max-w-none lg:pt-6 xl:col-span-6">
<Text as="h1" className="text-5xl mb-6 font-bold tracking-tight">
{t("title")}
{t("common:title")}
</Text>
<Text
as="h2"
+7 -1
View File
@@ -2,7 +2,7 @@
import { Box, Grid } from "@chakra-ui/react";
import type { NextPage } from "next";
import { FiBarChart2, FiLayout, FiMessageSquare, FiUsers } from "react-icons/fi";
import { FiBarChart2, FiLayout, FiMessageSquare, FiUsers, FiActivity } from "react-icons/fi";
import { Header } from "src/components/Header";
import { SlimFooter } from "./Dashboard/SlimFooter";
@@ -75,6 +75,12 @@ export const getAdminLayout = (page: React.ReactElement) => (
desc: "Users Dashboard",
icon: FiUsers,
},
{
label: "Status",
pathname: "/admin/status",
desc: "Status Dashboard",
icon: FiActivity,
},
]}
>
{page}
@@ -6,19 +6,19 @@ import { get } from "src/lib/api";
import { LeaderboardReply, LeaderboardTimeFrame } from "src/types/Leaderboard";
import useSWRImmutable from "swr/immutable";
const columns = [
const getColumns = (t) => [
{
Header: "Rank",
Header: t("rank"),
accessor: "rank",
style: { width: "90px" },
},
{
Header: "Score",
Header: t("score"),
accessor: "leader_score",
style: { width: "90px" },
},
{
Header: "User",
Header: t("user"),
accessor: "display_name",
},
];
@@ -27,11 +27,13 @@ const columns = [
* Presents a grid of leaderboard entries with more detailed information.
*/
const LeaderboardGridCell = ({ timeFrame }: { timeFrame: LeaderboardTimeFrame }) => {
const { t } = useTranslation();
const { t } = useTranslation(["leaderboard", "common"]);
const { data: reply } = useSWRImmutable<LeaderboardReply>(`/api/leaderboard?time_frame=${timeFrame}`, get, {
revalidateOnMount: true,
});
const columns = useMemo(() => getColumns(t), [t]);
const { getTableProps, getTableBodyProps, headerGroups, rows, prepareRow } = useTable({
columns,
data: reply?.leaderboard ?? [],
+21
View File
@@ -164,6 +164,27 @@ export class OasstApiClient {
});
}
/**
* Returns the tasks availability information for given `user`.
*/
async fetch_tasks_availability(user: object): Promise<any> {
return this.post("/api/v1/tasks/availability", user);
}
/**
* Returns the message stats from the backend.
*/
async fetch_stats(): Promise<any> {
return this.get("/api/v1/stats/");
}
/**
* Returns the tree manager stats from the backend.
*/
async fetch_tree_manager(): Promise<any> {
return this.get("/api/v1/stats/tree_manager");
}
/**
* Returns the `BackendUser` associated with `user_id`
*/
+174
View File
@@ -0,0 +1,174 @@
import {
Box,
Card,
CardBody,
CircularProgress,
SimpleGrid,
Text,
Table,
TableCaption,
TableContainer,
Tbody,
Td,
Th,
Thead,
Tr,
useColorMode,
} from "@chakra-ui/react";
import Head from "next/head";
import { useRouter } from "next/router";
import { useSession } from "next-auth/react";
import { useEffect } from "react";
import useSWRImmutable from "swr/immutable";
import { getAdminLayout } from "src/components/Layout";
import { get } from "src/lib/api";
/**
* Provides the admin status page that shows result of calls to several backend API endpoints,
* namely /api/v1/tasks/availability, /api/v1/stats/, /api/v1/stats/tree_manager
*/
const StatusIndex = () => {
const router = useRouter();
const { data: session, status } = useSession();
const { colorMode } = useColorMode();
const dataBackgroundColor = colorMode === "light" ? "gray.100" : "gray.800";
// Check when the user session is loaded and re-route if the user is not an
// admin. This follows the suggestion by NextJS for handling private pages:
// https://nextjs.org/docs/api-reference/next/router#usage
//
// All admin pages should use the same check and routing steps.
useEffect(() => {
if (status === "loading") {
return;
}
if (session?.user?.role === "admin") {
return;
}
router.push("/");
}, [router, session, status]);
const {
data: dataStatus,
error: errorStatus,
isLoading: isLoadingStatus,
} = useSWRImmutable("/api/admin/status", get);
const { tasksAvailability, stats, treeManager } = dataStatus || {};
return (
<>
<Head>
<title>Status - Open Assistant</title>
<meta
name="description"
content="Conversational AI for everyone. An open source project to create a chat enabled GPT LLM run by LAION and contributors around the world."
/>
</Head>
<SimpleGrid columns={[1, 1, 1, 1, 1, 2]} gap={4}>
<Card>
<CardBody>
<Text as="h1" fontSize="3xl" textAlign="center">
/api/v1/tasks/availability
</Text>
<Box bg={dataBackgroundColor} borderRadius="xl" p="6" pt="4" pr="12">
{tasksAvailability?.status === "fulfilled" ? (
<pre>{JSON.stringify(tasksAvailability.value, null, 2)}</pre>
) : tasksAvailability?.status === "rejected" ? (
<pre>{JSON.stringify(tasksAvailability.reason, null, 2)}</pre>
) : errorStatus ? (
<pre>{JSON.stringify(errorStatus, null, 2)}</pre>
) : (
<CircularProgress isIndeterminate />
)}
</Box>
</CardBody>
</Card>
<Card>
<CardBody>
<Text as="h1" fontSize="3xl" textAlign="center">
/api/v1/stats/
</Text>
<Box bg={dataBackgroundColor} borderRadius="xl" p="6" pt="4" pr="12">
{stats?.status === "fulfilled" ? (
<pre>{JSON.stringify(stats.value, null, 2)}</pre>
) : stats?.status === "rejected" ? (
<pre>{JSON.stringify(stats.reason, null, 2)}</pre>
) : errorStatus ? (
<pre>{JSON.stringify(errorStatus, null, 2)}</pre>
) : (
<CircularProgress isIndeterminate />
)}
</Box>
</CardBody>
</Card>
</SimpleGrid>
<br />
<Card>
<CardBody>
<Text as="h1" fontSize="3xl" textAlign="center">
/api/v1/stats/tree_manager
</Text>
{treeManager?.status === "fulfilled" ? (
<Box>
<Text as="h2" fontSize="2xl">
state_counts
</Text>
<Box bg={dataBackgroundColor} borderRadius="xl" p="6" pt="4" pr="12">
<pre>{JSON.stringify(treeManager.value.state_counts, null, 2)}</pre>
</Box>
<TableContainer>
<br />
<Text as="h2" fontSize="2xl">
message_counts
</Text>
<Table variant="simple">
<TableCaption>Tree Manager</TableCaption>
<Thead>
<Tr>
<Th>Message Tree ID</Th>
<Th>State</Th>
<Th>Depth</Th>
<Th>Oldest</Th>
<Th>Youngest</Th>
<Th>Count</Th>
<Th>Goal Tree Size</Th>
</Tr>
</Thead>
<Tbody>
{treeManager.value.message_counts.map(
({ message_tree_id, state, depth, oldest, youngest, count, goal_tree_size }) => (
<Tr key={message_tree_id}>
<Td>{message_tree_id}</Td>
<Td>{state}</Td>
<Td>{depth}</Td>
<Td>{oldest}</Td>
<Td>{youngest}</Td>
<Td>{count}</Td>
<Td>{goal_tree_size}</Td>
</Tr>
)
)}
</Tbody>
</Table>
</TableContainer>
</Box>
) : treeManager?.status === "rejected" ? (
<pre>{JSON.stringify(treeManager.reason, null, 2)}</pre>
) : errorStatus ? (
<pre>{JSON.stringify(errorStatus, null, 2)}</pre>
) : (
<CircularProgress isIndeterminate />
)}
</CardBody>
</Card>
</>
);
};
StatusIndex.getLayout = getAdminLayout;
export default StatusIndex;
+30
View File
@@ -0,0 +1,30 @@
import { getToken } from "next-auth/jwt";
import { withRole } from "src/lib/auth";
import { oasstApiClient } from "src/lib/oasst_api_client";
import { getBackendUserCore } from "src/lib/users";
/**
* Returns tasks availability, stats, and tree manager stats.
*/
const handler = withRole("admin", async (req, res) => {
const dummyUser = {
id: "__dummy_user__",
display_name: "Dummy User",
auth_method: "local",
};
const [tasksAvailabilityOutcome, statsOutcome, treeManagerOutcome] = await Promise.allSettled([
oasstApiClient.fetch_tasks_availability(dummyUser),
oasstApiClient.fetch_stats(),
oasstApiClient.fetch_tree_manager(),
]);
const status = {
tasksAvailability: tasksAvailabilityOutcome,
stats: statsOutcome,
treeManager: treeManagerOutcome,
};
res.status(200).json(status);
});
export default handler;
+2 -8
View File
@@ -3,17 +3,17 @@ import Head from "next/head";
import { useRouter } from "next/router";
import { useSession } from "next-auth/react";
import { useTranslation } from "next-i18next";
import { serverSideTranslations } from "next-i18next/serverSideTranslations";
import { useEffect } from "react";
import { CallToAction } from "src/components/CallToAction";
import { Faq } from "src/components/Faq";
import { Hero } from "src/components/Hero";
import { getTransparentHeaderLayout } from "src/components/Layout";
export { getDefaultStaticProps as getStaticProps } from "src/lib/default_static_props";
const Home = () => {
const router = useRouter();
const { status } = useSession();
const { t } = useTranslation("index");
const { t } = useTranslation();
useEffect(() => {
if (status === "authenticated") {
router.push("/dashboard");
@@ -37,10 +37,4 @@ const Home = () => {
Home.getLayout = getTransparentHeaderLayout;
export const getStaticProps = async ({ locale }) => ({
props: {
...(await serverSideTranslations(locale, ["index", "common"])),
},
});
export default Home;
+8 -6
View File
@@ -1,27 +1,29 @@
import { Box, Heading, Tab, TabList, TabPanel, TabPanels, Tabs } from "@chakra-ui/react";
import Head from "next/head";
import { useTranslation } from "next-i18next";
import { getDashboardLayout } from "src/components/Layout";
import { LeaderboardGridCell } from "src/components/LeaderboardGridCell";
export { getDefaultStaticProps as getStaticProps } from "src/lib/default_static_props";
import { LeaderboardTimeFrame } from "src/types/Leaderboard";
const Leaderboard = () => {
const { t } = useTranslation(["leaderboard", "common"]);
return (
<>
<Head>
<title>Leaderboard - Open Assistant</title>
<title>{`${t("leaderboard")} - ${t("common:title")}`}</title>
<meta name="description" content="Leaderboard Rankings" charSet="UTF-8" />
</Head>
<Box display="flex" flexDirection="column">
<Heading fontSize="2xl" fontWeight="bold" pb="4">
Leaderboard
{t("leaderboard")}
</Heading>
<Tabs isFitted isLazy>
<TabList>
<Tab>Daily</Tab>
<Tab>Weekly</Tab>
<Tab>Monthly</Tab>
<Tab>Overall</Tab>
<Tab>{t("daily")}</Tab>
<Tab>{t("weekly")}</Tab>
<Tab>{t("monthly")}</Tab>
<Tab>{t("overall")}</Tab>
</TabList>
<TabPanels>