diff --git a/.devcontainer/README.md b/.devcontainer/README.md index c93a9660..f7e4eae6 100644 --- a/.devcontainer/README.md +++ b/.devcontainer/README.md @@ -19,7 +19,7 @@ devcontainers in this repo. pre-commit run --all-files ``` -A successfull run should look something like this: +A successful run should look something like this: ``` @andrewm4894 ➜ /workspaces/Open-Assistant (devcontainer-improvements) $ pre-commit run --all-files diff --git a/.gitattributes b/.gitattributes index 6313b56c..7bc47e3f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ * text=auto eol=lf +*.ipynb linguist-documentation diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b31d1e8f..ced8fcdd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -76,7 +76,12 @@ repos: rev: v2.7.1 hooks: - id: prettier - args: [--prose-wrap=always, --write] + args: + [ + --prose-wrap=always, + --write, + --ignore-path=./website/.prettierignore, + ] - repo: local hooks: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 56dfa22c..39773b41 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,12 +50,12 @@ contributions smoothly we recommend the following: [Here](https://github.com/LAION-AI/Open-Assistant/pull/658) is an example PR for this project to illustrate this flow. 1. If you're lucky, we can merge your change into `main` without any problems. - If there's changes to files you're working on, resolve them by: -1. First try rebase as suggested - [in these instructions](https://timwise.co.uk/2019/10/14/merge-vs-rebase/#should-you-rebase). -1. If rebase feels too painful, merge as suggested - [in these instructions](https://timwise.co.uk/2019/10/14/merge-vs-rebase/#should-you-merge). -1. Once you've resolved any conflicts, finish the review and + If there's changes to files you're working on, resolve them by : + 1. First try rebase as suggested + [in these instructions](https://timwise.co.uk/2019/10/14/merge-vs-rebase/#should-you-rebase). + 1. If rebase feels too painful, merge as suggested + [in these instructions](https://timwise.co.uk/2019/10/14/merge-vs-rebase/#should-you-merge). +1. Once you've resolved conflicts (if any), finish the review and [squash and merge](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits) your PR (when squashing try to clean up or update the individual commit messages to be one sensible single one). diff --git a/backend/alembic/versions/2023_01_24_1134-8ba17b5f467a_add_message_id_to_message_reaction.py b/backend/alembic/versions/2023_01_24_1134-8ba17b5f467a_add_message_id_to_message_reaction.py new file mode 100644 index 00000000..8b877a9f --- /dev/null +++ b/backend/alembic/versions/2023_01_24_1134-8ba17b5f467a_add_message_id_to_message_reaction.py @@ -0,0 +1,34 @@ +"""add message_id to message_reaction + +Revision ID: 8ba17b5f467a +Revises: 160ac010efcc +Create Date: 2023-01-24 11:34:42.167575 + +""" +import sqlalchemy as sa +import sqlmodel +from alembic import op + +# revision identifiers, used by Alembic. +revision = "8ba17b5f467a" +down_revision = "160ac010efcc" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("message_reaction", sa.Column("message_id", sqlmodel.sql.sqltypes.GUID(), nullable=True)) + op.create_index(op.f("ix_message_reaction_message_id"), "message_reaction", ["message_id"], unique=False) + op.add_column("text_labels", sa.Column("task_id", sqlmodel.sql.sqltypes.GUID(), nullable=True)) + op.create_index(op.f("ix_text_labels_task_id"), "text_labels", ["task_id"], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f("ix_text_labels_task_id"), table_name="text_labels") + op.drop_column("text_labels", "task_id") + op.drop_index(op.f("ix_message_reaction_message_id"), table_name="message_reaction") + op.drop_column("message_reaction", "message_id") + # ### end Alembic commands ### diff --git a/backend/alembic/versions/2023_01_24_2256-40ed93df0ed5_add_message_emoji.py b/backend/alembic/versions/2023_01_24_2256-40ed93df0ed5_add_message_emoji.py new file mode 100644 index 00000000..17368c0d --- /dev/null +++ b/backend/alembic/versions/2023_01_24_2256-40ed93df0ed5_add_message_emoji.py @@ -0,0 +1,44 @@ +"""add message_emoji + +Revision ID: 40ed93df0ed5 +Revises: 8ba17b5f467a +Create Date: 2023-01-24 22:56:28.229408 + +""" +import sqlalchemy as sa +import sqlmodel +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "40ed93df0ed5" +down_revision = "8ba17b5f467a" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "message_emoji", + sa.Column("message_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column( + "created_date", sa.DateTime(timezone=True), server_default=sa.text("CURRENT_TIMESTAMP"), nullable=False + ), + sa.Column("emoji", sqlmodel.sql.sqltypes.AutoString(length=128), nullable=False), + sa.ForeignKeyConstraint(["message_id"], ["message.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("message_id", "user_id", "emoji"), + ) + op.create_index("ix_message_emoji__user_id__message_id", "message_emoji", ["user_id", "message_id"], unique=False) + op.add_column("message", sa.Column("emojis", postgresql.JSONB(astext_type=sa.Text()), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("message", "emojis") + op.drop_index("ix_message_emoji__user_id__message_id", table_name="message_emoji") + op.drop_table("message_emoji") + # ### end Alembic commands ### diff --git a/backend/alembic/versions/2023_01_26_1835-c84fcd6900dc_add_task_created_date_index.py b/backend/alembic/versions/2023_01_26_1835-c84fcd6900dc_add_task_created_date_index.py new file mode 100644 index 00000000..29fd1aec --- /dev/null +++ b/backend/alembic/versions/2023_01_26_1835-c84fcd6900dc_add_task_created_date_index.py @@ -0,0 +1,26 @@ +"""add task created date index + +Revision ID: c84fcd6900dc +Revises: 40ed93df0ed5 +Create Date: 2023-01-26 18:35:43.061589 + +""" +from alembic import op + +# revision identifiers, used by Alembic. +revision = "c84fcd6900dc" +down_revision = "40ed93df0ed5" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_index(op.f("ix_task_created_date"), "task", ["created_date"], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f("ix_task_created_date"), table_name="task") + # ### end Alembic commands ### diff --git a/backend/alembic/versions/2023_01_27_2013-f856bf19d32b_add_user_show_on_leaderboard.py b/backend/alembic/versions/2023_01_27_2013-f856bf19d32b_add_user_show_on_leaderboard.py new file mode 100644 index 00000000..f7680ed2 --- /dev/null +++ b/backend/alembic/versions/2023_01_27_2013-f856bf19d32b_add_user_show_on_leaderboard.py @@ -0,0 +1,29 @@ +"""add user.show_on_leaderboard + +Revision ID: f856bf19d32b +Revises: c84fcd6900dc +Create Date: 2023-01-27 20:13:56.533374 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "f856bf19d32b" +down_revision = "c84fcd6900dc" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "user", sa.Column("show_on_leaderboard", sa.Boolean(), server_default=sa.text("true"), nullable=False) + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("user", "show_on_leaderboard") + # ### end Alembic commands ### diff --git a/backend/main.py b/backend/main.py index 147f8e30..ea4b25da 100644 --- a/backend/main.py +++ b/backend/main.py @@ -273,6 +273,38 @@ def get_openapi_schema(): return json.dumps(app.openapi()) +def export_ready_trees(file: Optional[str] = None, use_compression: bool = False): + try: + with Session(engine) as db: + api_client = api_auth(settings.OFFICIAL_WEB_API_KEY, db=db) + dummy_user = protocol_schema.User(id="__dummy_user__", display_name="Dummy User", auth_method="local") + + ur = UserRepository(db=db, api_client=api_client) + tr = TaskRepository(db=db, api_client=api_client, client_user=dummy_user, user_repository=ur) + pr = PromptRepository( + db=db, api_client=api_client, client_user=dummy_user, user_repository=ur, task_repository=tr + ) + tm = TreeManager(db, pr) + + tm.export_all_ready_trees(file, use_compression=use_compression) + except Exception: + logger.exception("Error exporting trees.") + + +def retry_scoring_failed_message_trees(): + try: + logger.info("TreeManager.retry_scoring_failed_message_trees()") + with Session(engine) as db: + api_client = api_auth(settings.OFFICIAL_WEB_API_KEY, db=db) + + pr = PromptRepository(db=db, api_client=api_client) + tm = TreeManager(db, pr) + tm.retry_scoring_failed_message_trees() + + except Exception: + logger.exception("TreeManager.retry_scoring_failed_message_trees() failed.") + + def main(): # Importing here so we don't import packages unnecessarily if we're # importing main as a module. @@ -289,11 +321,28 @@ def main(): ) parser.add_argument("--host", help="The host to run the server", default="0.0.0.0") parser.add_argument("--port", help="The port to run the server", default=8080) + parser.add_argument( + "--export", help="Export all trees which are ready for exporting.", action=argparse.BooleanOptionalAction + ) + parser.add_argument( + "--export-file", + help="Name of file to export trees to. If not provided when exporting, output will be send to STDOUT", + ) + parser.add_argument( + "--retry-scoring", + help="Retry scoring failed message trees", + action=argparse.BooleanOptionalAction, + ) args = parser.parse_args() if args.print_openapi_schema: print(get_openapi_schema()) + elif args.export: + use_compression: bool = ".gz" in args.export_file + export_ready_trees(file=args.export_file, use_compression=use_compression) + elif args.retry_scoring: + retry_scoring_failed_message_trees() else: uvicorn.run(app, host=args.host, port=args.port) diff --git a/backend/oasst_backend/api/deps.py b/backend/oasst_backend/api/deps.py index 985b72a0..dd10bd20 100644 --- a/backend/oasst_backend/api/deps.py +++ b/backend/oasst_backend/api/deps.py @@ -1,6 +1,6 @@ from http import HTTPStatus from secrets import token_hex -from typing import Generator +from typing import Generator, NamedTuple from fastapi import Depends, Request, Response, Security from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer @@ -19,22 +19,46 @@ def get_db() -> Generator: yield db -api_key_query = APIKeyQuery(name="api_key", auto_error=False) -api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False) +api_key_query = APIKeyQuery(name="api_key", scheme_name="api-key", auto_error=False) +api_key_header = APIKeyHeader(name="X-API-Key", scheme_name="api-key", auto_error=False) +oasst_user_query = APIKeyQuery(name="oasst_user", scheme_name="oasst-user", auto_error=False) +oasst_user_header = APIKeyHeader(name="x-oasst-user", scheme_name="oasst-user", auto_error=False) bearer_token = HTTPBearer(auto_error=False) -async def get_api_key( +def get_api_key( api_key_query: str = Security(api_key_query), api_key_header: str = Security(api_key_header), -): +) -> str: if api_key_query: return api_key_query else: return api_key_header +class FrontendUserId(NamedTuple): + auth_method: str + username: str + + +def get_frontend_user_id( + user_query: str = Security(oasst_user_query), + user_header: str = Security(oasst_user_header), +) -> FrontendUserId: + def split_user(v: str) -> tuple[str, str]: + if type(v) is str: + v = v.split(":", maxsplit=1) + if len(v) == 2: + return FrontendUserId(auth_method=v[0], username=v[1]) + return FrontendUserId(auth_method=None, username=None) + + if user_query: + return split_user(user_query) + else: + return split_user(user_header) + + def create_api_client( *, session: Session, diff --git a/backend/oasst_backend/api/v1/frontend_users.py b/backend/oasst_backend/api/v1/frontend_users.py index 5d96aa90..a4ca6380 100644 --- a/backend/oasst_backend/api/v1/frontend_users.py +++ b/backend/oasst_backend/api/v1/frontend_users.py @@ -70,13 +70,14 @@ def query_frontend_user_messages( only_roots: bool = False, desc: bool = True, include_deleted: bool = False, + lang: Optional[str] = None, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db), ): """ Query frontend user messages. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, auth_method=auth_method, username=username) messages = pr.query_messages_ordered_by_created_date( auth_method=auth_method, username=username, @@ -87,6 +88,7 @@ def query_frontend_user_messages( lte_created_date=end_date, only_roots=only_roots, deleted=None if include_deleted else False, + lang=lang, ) return utils.prepare_message_list(messages) @@ -95,24 +97,28 @@ def query_frontend_user_messages( def query_frontend_user_messages_cursor( auth_method: str, username: str, - lt: Optional[str] = None, - gt: Optional[str] = None, + before: Optional[str] = None, + after: Optional[str] = None, only_roots: Optional[bool] = False, include_deleted: Optional[bool] = False, max_count: Optional[int] = Query(10, gt=0, le=1000), desc: Optional[bool] = False, + lang: Optional[str] = None, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db), ): return get_messages_cursor( - lt=lt, - gt=gt, + before=before, + after=after, auth_method=auth_method, username=username, only_roots=only_roots, include_deleted=include_deleted, max_count=max_count, desc=desc, + lang=lang, + frontend_user=frontend_user, api_client=api_client, db=db, ) diff --git a/backend/oasst_backend/api/v1/messages.py b/backend/oasst_backend/api/v1/messages.py index 1ef1e929..1fbaf53c 100644 --- a/backend/oasst_backend/api/v1/messages.py +++ b/backend/oasst_backend/api/v1/messages.py @@ -7,6 +7,7 @@ from oasst_backend.api import deps from oasst_backend.api.v1 import utils from oasst_backend.models import ApiClient from oasst_backend.prompt_repository import PromptRepository +from oasst_backend.utils.database_utils import CommitMode, managed_tx_function from oasst_shared.exceptions.oasst_api_error import OasstError, OasstErrorCode from oasst_shared.schemas import protocol from sqlmodel import Session @@ -17,6 +18,7 @@ router = APIRouter() @router.get("/", response_model=list[protocol.Message]) def query_messages( + *, auth_method: Optional[str] = None, username: Optional[str] = None, api_client_id: Optional[str] = None, @@ -26,13 +28,15 @@ def query_messages( only_roots: Optional[bool] = False, desc: Optional[bool] = True, allow_deleted: Optional[bool] = False, + lang: Optional[str] = None, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db), ): """ Query messages. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, auth_method=frontend_user.auth_method, username=frontend_user.username) messages = pr.query_messages_ordered_by_created_date( auth_method=auth_method, username=username, @@ -43,6 +47,7 @@ def query_messages( lte_created_date=end_date, only_roots=only_roots, deleted=None if allow_deleted else False, + lang=lang, ) return utils.prepare_message_list(messages) @@ -50,8 +55,9 @@ def query_messages( @router.get("/cursor", response_model=protocol.MessagePage) def get_messages_cursor( - lt: Optional[str] = None, - gt: Optional[str] = None, + *, + before: Optional[str] = None, + after: Optional[str] = None, user_id: Optional[UUID] = None, auth_method: Optional[str] = None, username: Optional[str] = None, @@ -60,9 +66,13 @@ def get_messages_cursor( include_deleted: Optional[bool] = False, max_count: Optional[int] = Query(10, gt=0, le=1000), desc: Optional[bool] = False, + lang: Optional[str] = None, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db), ): + assert max_count is not None + def split_cursor(x: str | None) -> tuple[datetime, UUID]: if not x: return None, None @@ -74,11 +84,21 @@ def get_messages_cursor( except ValueError: raise OasstError("Invalid cursor value", OasstErrorCode.INVALID_CURSOR_VALUE) - lte_created_date, lt_id = split_cursor(lt) - gte_created_date, gt_id = split_cursor(gt) + if desc: + gte_created_date, gt_id = split_cursor(before) + lte_created_date, lt_id = split_cursor(after) + query_desc = not (before is not None and not after) + else: + lte_created_date, lt_id = split_cursor(before) + gte_created_date, gt_id = split_cursor(after) + query_desc = before is not None and not after - pr = PromptRepository(db, api_client) - messages = pr.query_messages_ordered_by_created_date( + print(f"{desc=} {query_desc=} {gte_created_date=} {lte_created_date=}") + + qry_max_count = max_count + 1 if before is None or after is None else max_count + + pr = PromptRepository(db, api_client, frontend_user=frontend_user) + items = pr.query_messages_ordered_by_created_date( user_id=user_id, auth_method=auth_method, username=username, @@ -89,22 +109,31 @@ def get_messages_cursor( lt_id=lt_id, only_roots=only_roots, deleted=None if include_deleted else False, - desc=desc, - limit=max_count, + desc=query_desc, + limit=qry_max_count, + lang=lang, ) - items = utils.prepare_message_list(messages) + num_rows = len(items) + if qry_max_count > max_count and num_rows == qry_max_count: + assert not (before and after) + items = items[:-1] + + if desc != query_desc: + items.reverse() + + items = utils.prepare_message_list(items) n, p = None, None if len(items) > 0: - if len(items) == max_count or gte_created_date: + if (num_rows > max_count and before) or after: p = str(items[0].id) + "$" + items[0].created_date.isoformat() - if len(items) == max_count or lte_created_date: + if num_rows > max_count or before: n = str(items[-1].id) + "$" + items[-1].created_date.isoformat() else: - if gte_created_date: - p = gte_created_date.isoformat() - if lte_created_date: - n = lte_created_date.isoformat() + if after: + p = lte_created_date.isoformat() if desc else gte_created_date.isoformat() + if before: + n = gte_created_date.isoformat() if desc else lte_created_date.isoformat() order = "desc" if desc else "asc" return protocol.MessagePage(prev=p, next=n, sort_key="created_date", order=order, items=items) @@ -112,37 +141,49 @@ def get_messages_cursor( @router.get("/{message_id}", response_model=protocol.Message) def get_message( - message_id: UUID, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db) + *, + message_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_api_client), + db: Session = Depends(deps.get_db), ): """ Get a message by its internal ID. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) message = pr.fetch_message(message_id) return utils.prepare_message(message) @router.get("/{message_id}/conversation", response_model=protocol.Conversation) def get_conv( - message_id: UUID, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db) + *, + message_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_api_client), + db: Session = Depends(deps.get_db), ): """ Get a conversation from the tree root and up to the message with given internal ID. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) messages = pr.fetch_message_conversation(message_id) return utils.prepare_conversation(messages) @router.get("/{message_id}/tree", response_model=protocol.MessageTree) def get_tree( - message_id: UUID, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db) + *, + message_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_api_client), + db: Session = Depends(deps.get_db), ): """ Get all messages belonging to the same message tree. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) message = pr.fetch_message(message_id) tree = pr.fetch_message_tree(message.message_tree_id, reviewed=False) return utils.prepare_tree(tree, message.message_tree_id) @@ -150,24 +191,32 @@ def get_tree( @router.get("/{message_id}/children", response_model=list[protocol.Message]) def get_children( - message_id: UUID, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db) + *, + message_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_api_client), + db: Session = Depends(deps.get_db), ): """ Get all messages belonging to the same message tree. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) messages = pr.fetch_message_children(message_id) return utils.prepare_message_list(messages) @router.get("/{message_id}/descendants", response_model=protocol.MessageTree) def get_descendants( - message_id: UUID, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db) + *, + message_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_api_client), + db: Session = Depends(deps.get_db), ): """ Get a subtree which starts with this message. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) message = pr.fetch_message(message_id) descendants = pr.fetch_message_descendants(message) return utils.prepare_tree(descendants, message.id) @@ -175,12 +224,16 @@ def get_descendants( @router.get("/{message_id}/longest_conversation_in_tree", response_model=protocol.Conversation) def get_longest_conv( - message_id: UUID, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db) + *, + message_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_api_client), + db: Session = Depends(deps.get_db), ): """ Get the longest conversation from the tree of the message. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) message = pr.fetch_message(message_id) conv = pr.fetch_longest_conversation(message.message_tree_id) return utils.prepare_conversation(conv) @@ -188,12 +241,16 @@ def get_longest_conv( @router.get("/{message_id}/max_children_in_tree", response_model=protocol.MessageTree) def get_max_children( - message_id: UUID, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db) + *, + message_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_api_client), + db: Session = Depends(deps.get_db), ): """ Get message with the most children from the tree of the provided message. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) message = pr.fetch_message(message_id) message, children = pr.fetch_message_with_max_children(message.message_tree_id) return utils.prepare_tree([message, *children], message.id) @@ -201,7 +258,30 @@ def get_max_children( @router.delete("/{message_id}", status_code=HTTP_204_NO_CONTENT) def mark_message_deleted( - message_id: UUID, api_client: ApiClient = Depends(deps.get_trusted_api_client), db: Session = Depends(deps.get_db) + *, + message_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_trusted_api_client), + db: Session = Depends(deps.get_db), ): - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) pr.mark_messages_deleted(message_id) + + +@router.post("/{message_id}/emoji", response_model=protocol.Message) +def post_message_emoji( + *, + message_id: UUID, + request: protocol.MessageEmojiRequest, + api_client: ApiClient = Depends(deps.get_api_client), +) -> protocol.Message: + """ + Toggle, add or remove message emoji. + """ + + @managed_tx_function(CommitMode.COMMIT) + def emoji_tx(session: deps.Session): + pr = PromptRepository(session, api_client, client_user=request.user) + return pr.handle_message_emoji(message_id, request.op, request.emoji) + + return utils.prepare_message(emoji_tx()) diff --git a/backend/oasst_backend/api/v1/tasks.py b/backend/oasst_backend/api/v1/tasks.py index 7875ac83..ae498bbb 100644 --- a/backend/oasst_backend/api/v1/tasks.py +++ b/backend/oasst_backend/api/v1/tasks.py @@ -77,6 +77,7 @@ def tasks_acknowledge( *, db: Session = Depends(deps.get_db), api_key: APIKey = Depends(deps.get_api_key), + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), task_id: UUID, ack_request: protocol_schema.TaskAck, ) -> None: @@ -87,7 +88,7 @@ def tasks_acknowledge( api_client = deps.api_auth(api_key, db) try: - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) # here we store the message id in the database for the task logger.info(f"Frontend acknowledges task {task_id=}, {ack_request=}.") @@ -105,6 +106,7 @@ def tasks_acknowledge_failure( *, db: Session = Depends(deps.get_db), api_key: APIKey = Depends(deps.get_api_key), + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), task_id: UUID, nack_request: protocol_schema.TaskNAck, ) -> None: @@ -115,7 +117,7 @@ def tasks_acknowledge_failure( try: logger.info(f"Frontend reports failure to implement task {task_id=}, {nack_request=}.") api_client = deps.api_auth(api_key, db) - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) pr.task_repository.acknowledge_task_failure(task_id) except (KeyError, RuntimeError): logger.exception("Failed to not acknowledge task.") diff --git a/backend/oasst_backend/api/v1/text_labels.py b/backend/oasst_backend/api/v1/text_labels.py index c9afd88c..2025fd4c 100644 --- a/backend/oasst_backend/api/v1/text_labels.py +++ b/backend/oasst_backend/api/v1/text_labels.py @@ -3,9 +3,11 @@ from fastapi.security.api_key import APIKey from loguru import logger from oasst_backend.api import deps from oasst_backend.prompt_repository import PromptRepository -from oasst_backend.schemas.text_labels import LabelOption, ValidLabelsResponse +from oasst_backend.schemas.text_labels import LabelDescription, ValidLabelsResponse +from oasst_backend.utils.database_utils import CommitMode, managed_tx_function +from oasst_shared.exceptions import OasstError from oasst_shared.schemas import protocol as protocol_schema -from sqlmodel import Session +from oasst_shared.schemas.protocol import TextLabel from starlette.status import HTTP_204_NO_CONTENT, HTTP_400_BAD_REQUEST router = APIRouter() @@ -14,20 +16,25 @@ router = APIRouter() @router.post("/", status_code=HTTP_204_NO_CONTENT) def label_text( *, - db: Session = Depends(deps.get_db), api_key: APIKey = Depends(deps.get_api_key), text_labels: protocol_schema.TextLabels, ) -> None: """ Label a piece of text. """ - api_client = deps.api_auth(api_key, db) + + @managed_tx_function(CommitMode.COMMIT) + def store_text_labels(session: deps.Session): + api_client = deps.api_auth(api_key, session) + pr = PromptRepository(session, api_client, client_user=text_labels.user) + pr.store_text_labels(text_labels) try: logger.info(f"Labeling text {text_labels=}.") - pr = PromptRepository(db, api_client, client_user=text_labels.user) - pr.store_text_labels(text_labels) + store_text_labels() + except OasstError: + raise except Exception: logger.exception("Failed to store label.") raise HTTPException( @@ -39,7 +46,29 @@ def label_text( def get_valid_lables() -> ValidLabelsResponse: return ValidLabelsResponse( valid_labels=[ - LabelOption(name=l.value, display_text=l.display_text, help_text=l.help_text) - for l in protocol_schema.TextLabel + LabelDescription(name=l.value, widget=l.widget.value, display_text=l.display_text, help_text=l.help_text) + for l in TextLabel + ] + ) + + +@router.get("/report_labels") +def get_report_lables() -> ValidLabelsResponse: + report_labels = [ + TextLabel.spam, + TextLabel.not_appropriate, + TextLabel.pii, + TextLabel.hate_speech, + TextLabel.sexual_content, + TextLabel.moral_judgement, + TextLabel.political_content, + TextLabel.toxicity, + TextLabel.violence, + TextLabel.quality, + ] + return ValidLabelsResponse( + valid_labels=[ + LabelDescription(name=l.value, widget=l.widget.value, display_text=l.display_text, help_text=l.help_text) + for l in report_labels ] ) diff --git a/backend/oasst_backend/api/v1/users.py b/backend/oasst_backend/api/v1/users.py index ed07eb21..dc0a3242 100644 --- a/backend/oasst_backend/api/v1/users.py +++ b/backend/oasst_backend/api/v1/users.py @@ -28,6 +28,7 @@ def get_users_ordered_by_username( search_text: Optional[str] = None, auth_method: Optional[str] = None, max_count: Optional[int] = Query(100, gt=0, le=10000), + desc: Optional[bool] = False, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db), ): @@ -41,6 +42,7 @@ def get_users_ordered_by_username( auth_method=auth_method, search_text=search_text, limit=max_count, + desc=desc, ) return [u.to_protocol_frontend_user() for u in users] @@ -55,6 +57,7 @@ def get_users_ordered_by_display_name( auth_method: Optional[str] = None, search_text: Optional[str] = None, max_count: Optional[int] = Query(100, gt=0, le=10000), + desc: Optional[bool] = False, api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db), ): @@ -68,14 +71,15 @@ def get_users_ordered_by_display_name( auth_method=auth_method, search_text=search_text, limit=max_count, + desc=desc, ) return [u.to_protocol_frontend_user() for u in users] @router.get("/cursor", response_model=protocol.FrontEndUserPage) def get_users_cursor( - lt: Optional[str] = None, - gt: Optional[str] = None, + before: Optional[str] = None, + after: Optional[str] = None, sort_key: Optional[str] = Query("username", max_length=32), max_count: Optional[int] = Query(100, gt=0, le=10000), api_client_id: Optional[UUID] = None, @@ -95,7 +99,8 @@ def get_users_cursor( return x, None items: list[protocol.FrontEndUser] - qry_max_count = max_count + 1 if lt is None or gt is None else max_count + qry_max_count = max_count + 1 if before is None or after is None else max_count + desc = before is not None and not after def get_next_prev(num_rows: int, lt: str | None, gt: str | None, key_fn: Callable[[protocol.FrontEndUser], str]): p, n = None, None @@ -114,17 +119,16 @@ def get_users_cursor( def remove_extra_item(items: list[protocol.FrontEndUser], lt: str | None, gt: str | None): num_rows = len(items) if qry_max_count > max_count and num_rows == qry_max_count: - assert not (lt and gt) - if lt: - items = items[1:] - else: - items = items[:-1] + assert not (lt is not None and gt is not None) + items = items[:-1] + if desc: + items.reverse() return items, num_rows n, p = None, None if sort_key == "username": - lte_username, lt_id = split_cursor(lt) - gte_username, gt_id = split_cursor(gt) + lte_username, lt_id = split_cursor(before) + gte_username, gt_id = split_cursor(after) items = get_users_ordered_by_username( api_client_id=api_client_id, gte_username=gte_username, @@ -134,6 +138,7 @@ def get_users_cursor( auth_method=auth_method, search_text=search_text, max_count=qry_max_count, + desc=desc, api_client=api_client, db=db, ) @@ -141,8 +146,8 @@ def get_users_cursor( p, n = get_next_prev(num_rows, lte_username, gte_username, lambda x: x.id) elif sort_key == "display_name": - lte_display_name, lt_id = split_cursor(lt) - gte_display_name, gt_id = split_cursor(gt) + lte_display_name, lt_id = split_cursor(before) + gte_display_name, gt_id = split_cursor(after) items = get_users_ordered_by_display_name( api_client_id=api_client_id, gte_display_name=gte_display_name, @@ -152,6 +157,7 @@ def get_users_cursor( auth_method=auth_method, search_text=search_text, max_count=qry_max_count, + desc=desc, api_client=api_client, db=db, ) @@ -184,6 +190,7 @@ def update_user( user_id: UUID, enabled: Optional[bool] = None, notes: Optional[str] = None, + show_on_leaderboard: Optional[bool] = None, db: Session = Depends(deps.get_db), api_client: ApiClient = Depends(deps.get_trusted_api_client), ): @@ -191,7 +198,7 @@ def update_user( Update a user by global user ID. Only trusted clients can update users. """ ur = UserRepository(db, api_client) - ur.update_user(user_id, enabled, notes) + ur.update_user(user_id, enabled, notes, show_on_leaderboard) @router.delete("/{user_id}", status_code=HTTP_204_NO_CONTENT) @@ -217,13 +224,15 @@ def query_user_messages( only_roots: bool = False, desc: bool = True, include_deleted: bool = False, + lang: Optional[str] = None, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db), ): """ Query user messages. """ - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) messages = pr.query_messages_ordered_by_created_date( user_id=user_id, api_client_id=api_client_id, @@ -233,6 +242,7 @@ def query_user_messages( lte_created_date=end_date, only_roots=only_roots, deleted=None if include_deleted else False, + lang=lang, ) return utils.prepare_message_list(messages) @@ -241,23 +251,27 @@ def query_user_messages( @router.get("/{user_id}/messages/cursor", response_model=protocol.MessagePage) def query_user_messages_cursor( user_id: Optional[UUID], - lt: Optional[str] = None, - gt: Optional[str] = None, + before: Optional[str] = None, + after: Optional[str] = None, only_roots: Optional[bool] = False, include_deleted: Optional[bool] = False, max_count: Optional[int] = Query(10, gt=0, le=1000), desc: Optional[bool] = False, + lang: Optional[str] = None, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), api_client: ApiClient = Depends(deps.get_api_client), db: Session = Depends(deps.get_db), ): return get_messages_cursor( - lt=lt, - gt=gt, + before=before, + after=after, user_id=user_id, only_roots=only_roots, include_deleted=include_deleted, max_count=max_count, desc=desc, + lang=lang, + frontend_user=frontend_user, api_client=api_client, db=db, ) @@ -265,9 +279,12 @@ def query_user_messages_cursor( @router.delete("/{user_id}/messages", status_code=HTTP_204_NO_CONTENT) def mark_user_messages_deleted( - user_id: UUID, api_client: ApiClient = Depends(deps.get_trusted_api_client), db: Session = Depends(deps.get_db) + user_id: UUID, + frontend_user: deps.FrontendUserId = Depends(deps.get_frontend_user_id), + api_client: ApiClient = Depends(deps.get_trusted_api_client), + db: Session = Depends(deps.get_db), ): - pr = PromptRepository(db, api_client) + pr = PromptRepository(db, api_client, frontend_user=frontend_user) messages = pr.query_messages_ordered_by_created_date(user_id=user_id, limit=None) pr.mark_messages_deleted(messages) diff --git a/backend/oasst_backend/api/v1/utils.py b/backend/oasst_backend/api/v1/utils.py index 99161e32..b9e982ab 100644 --- a/backend/oasst_backend/api/v1/utils.py +++ b/backend/oasst_backend/api/v1/utils.py @@ -14,6 +14,8 @@ def prepare_message(m: Message) -> protocol.Message: lang=m.lang, is_assistant=(m.role == "assistant"), created_date=m.created_date, + emojis=m.emojis or {}, + user_emojis=m.user_emojis or [], ) @@ -21,17 +23,20 @@ def prepare_message_list(messages: list[Message]) -> list[protocol.Message]: return [prepare_message(m) for m in messages] +def prepare_conversation_message(message: Message) -> protocol.ConversationMessage: + return protocol.ConversationMessage( + id=message.id, + frontend_message_id=message.frontend_message_id, + text=message.text, + lang=message.lang, + is_assistant=(message.role == "assistant"), + emojis=message.emojis or {}, + user_emojis=message.user_emojis or [], + ) + + def prepare_conversation_message_list(messages: list[Message]) -> list[protocol.ConversationMessage]: - return [ - protocol.ConversationMessage( - id=message.id, - frontend_message_id=message.frontend_message_id, - text=message.text, - lang=message.lang, - is_assistant=(message.role == "assistant"), - ) - for message in messages - ] + return [prepare_conversation_message(message) for message in messages] def prepare_conversation(messages: list[Message]) -> protocol.Conversation: diff --git a/backend/oasst_backend/config.py b/backend/oasst_backend/config.py index a6ed1d60..209e6fe6 100644 --- a/backend/oasst_backend/config.py +++ b/backend/oasst_backend/config.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -from oasst_shared.schemas import protocol as protocol_schema +from oasst_shared.schemas.protocol import TextLabel from pydantic import AnyHttpUrl, BaseModel, BaseSettings, FilePath, PostgresDsn, validator @@ -46,17 +46,69 @@ class TreeManagerConfiguration(BaseModel): num_required_rankings: int = 3 """Number of rankings in which the message participated.""" - mandatory_labels_initial_prompt: Optional[list[protocol_schema.TextLabel]] = [protocol_schema.TextLabel.spam] + labels_initial_prompt: list[TextLabel] = [ + TextLabel.spam, + TextLabel.quality, + TextLabel.helpfulness, + TextLabel.creativity, + TextLabel.humor, + TextLabel.toxicity, + TextLabel.violence, + TextLabel.not_appropriate, + TextLabel.pii, + TextLabel.hate_speech, + TextLabel.sexual_content, + ] + + labels_assistant_reply: list[TextLabel] = [ + TextLabel.spam, + TextLabel.fails_task, + TextLabel.quality, + TextLabel.helpfulness, + TextLabel.creativity, + TextLabel.humor, + TextLabel.toxicity, + TextLabel.violence, + TextLabel.not_appropriate, + TextLabel.pii, + TextLabel.hate_speech, + TextLabel.sexual_content, + ] + + labels_prompter_reply: list[TextLabel] = [ + TextLabel.spam, + TextLabel.quality, + TextLabel.helpfulness, + TextLabel.humor, + TextLabel.creativity, + TextLabel.toxicity, + TextLabel.violence, + TextLabel.not_appropriate, + TextLabel.pii, + TextLabel.hate_speech, + TextLabel.sexual_content, + ] + + mandatory_labels_initial_prompt: Optional[list[TextLabel]] = [TextLabel.spam] """Mandatory labels in text-labeling tasks for initial prompts.""" - mandatory_labels_assistant_reply: Optional[list[protocol_schema.TextLabel]] = [protocol_schema.TextLabel.spam] + mandatory_labels_assistant_reply: Optional[list[TextLabel]] = [TextLabel.spam] """Mandatory labels in text-labeling tasks for assistant replies.""" - mandatory_labels_prompter_reply: Optional[list[protocol_schema.TextLabel]] = [protocol_schema.TextLabel.spam] + mandatory_labels_prompter_reply: Optional[list[TextLabel]] = [TextLabel.spam] """Mandatory labels in text-labeling tasks for prompter replies.""" rank_prompter_replies: bool = False + lonely_children_count: int = 3 + """Number of children below which parents are preferred during sampling for reply tasks.""" + + p_lonely_child_extension: float = 0.8 + """Probability to select a parent with less than lonely_children_count children.""" + + recent_tasks_span_sec: int = 3 * 60 # 3 min + """Time in seconds of recent tasks to consider for exclusion during task selection.""" + class Settings(BaseSettings): PROJECT_NAME: str = "open-assistant backend" @@ -90,10 +142,13 @@ class Settings(BaseSettings): Path(__file__).parent.parent / "test_data/realistic/realistic_seed_data.json" ) DEBUG_ALLOW_SELF_LABELING: bool = False # allow users to label their own messages + DEBUG_ALLOW_DUPLICATE_TASKS: bool = False # offer users tasks to which they already responded DEBUG_SKIP_EMBEDDING_COMPUTATION: bool = False DEBUG_SKIP_TOXICITY_CALCULATION: bool = False DEBUG_DATABASE_ECHO: bool = False + DUPLICATE_MESSAGE_FILTER_WINDOW_MINUTES: int = 120 + HUGGING_FACE_API_KEY: str = "" ROOT_TOKENS: List[str] = ["1234"] # supply a string that can be parsed to a json list diff --git a/backend/oasst_backend/crud/__init__.py b/backend/oasst_backend/crud/__init__.py deleted file mode 100644 index a9a2c5b3..00000000 --- a/backend/oasst_backend/crud/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__all__ = [] diff --git a/backend/oasst_backend/crud/base.py b/backend/oasst_backend/crud/base.py deleted file mode 100644 index 432d029d..00000000 --- a/backend/oasst_backend/crud/base.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union - -from fastapi.encoders import jsonable_encoder -from pydantic import BaseModel -from sqlmodel import Session, SQLModel - -ModelType = TypeVar("ModelType", bound=SQLModel) -CreateSchemaType = TypeVar("CreateSchemaType", bound=BaseModel) -UpdateSchemaType = TypeVar("UpdateSchemaType", bound=BaseModel) - - -class CRUDBase(Generic[ModelType, CreateSchemaType, UpdateSchemaType]): - def __init__(self, model: Type[ModelType]): - """ - CRUD object with default methods to Create, Read, Update, Delete (CRUD). - - **Parameters** - - * `model`: A SQLModel model class - * `schema`: A Pydantic model (schema) class - """ - self.model = model - - def get(self, db: Session, id: Any) -> Optional[ModelType]: - return db.query(self.model).filter(self.model.id == id).first() - - def get_multi(self, db: Session, *, begin_id: int = 0, limit: int = 100) -> List[ModelType]: - return db.query(self.model).filter(self.model.id >= begin_id).limit(limit).all() - - def create(self, db: Session, *, obj_in: CreateSchemaType) -> ModelType: - obj_in_data = jsonable_encoder(obj_in) - db_obj = self.model(**obj_in_data) # type: ignore - db.add(db_obj) - db.commit() - db.refresh(db_obj) - return db_obj - - def update(self, db: Session, *, db_obj: ModelType, obj_in: Union[UpdateSchemaType, Dict[str, Any]]) -> ModelType: - obj_data = jsonable_encoder(db_obj) - if isinstance(obj_in, dict): - update_data = obj_in - else: - update_data = obj_in.dict(exclude_unset=True) - for field in obj_data: - if field in update_data: - setattr(db_obj, field, update_data[field]) - db.add(db_obj) - db.commit() - db.refresh(db_obj) - return db_obj - - def delete(self, db: Session, *, id: int) -> ModelType: - obj = db.query(self.model).get(id) - db.delete(obj) - db.commit() - return obj diff --git a/backend/oasst_backend/models/__init__.py b/backend/oasst_backend/models/__init__.py index 2b30b475..420c0ccd 100644 --- a/backend/oasst_backend/models/__init__.py +++ b/backend/oasst_backend/models/__init__.py @@ -2,6 +2,7 @@ from .api_client import ApiClient from .journal import Journal, JournalIntegration from .message import Message from .message_embedding import MessageEmbedding +from .message_emoji import MessageEmoji from .message_reaction import MessageReaction from .message_toxicity import MessageToxicity from .message_tree_state import MessageTreeState @@ -24,4 +25,5 @@ __all__ = [ "TextLabels", "Journal", "JournalIntegration", + "MessageEmoji", ] diff --git a/backend/oasst_backend/models/db_payload.py b/backend/oasst_backend/models/db_payload.py index ddaf2391..b0eb38df 100644 --- a/backend/oasst_backend/models/db_payload.py +++ b/backend/oasst_backend/models/db_payload.py @@ -117,7 +117,8 @@ class LabelConversationReplyPayload(TaskPayload): message_id: UUID conversation: protocol_schema.Conversation - reply: str + reply: str # deprecated + reply_message: Optional[protocol_schema.ConversationMessage] valid_labels: list[str] mandatory_labels: Optional[list[str]] mode: Optional[protocol_schema.LabelTaskMode] diff --git a/backend/oasst_backend/models/message.py b/backend/oasst_backend/models/message.py index d0b1d869..24fafc01 100644 --- a/backend/oasst_backend/models/message.py +++ b/backend/oasst_backend/models/message.py @@ -1,12 +1,13 @@ from datetime import datetime from http import HTTPStatus -from typing import Optional +from typing import Any, Optional from uuid import UUID, uuid4 import sqlalchemy as sa import sqlalchemy.dialects.postgresql as pg from oasst_backend.models.db_payload import MessagePayload from oasst_shared.exceptions.oasst_api_error import OasstError, OasstErrorCode +from pydantic import PrivateAttr from sqlalchemy import false from sqlmodel import Field, Index, SQLModel @@ -17,6 +18,13 @@ class Message(SQLModel, table=True): __tablename__ = "message" __table_args__ = (Index("ix_message_frontend_message_id", "api_client_id", "frontend_message_id", unique=True),) + def __new__(cls, *args: Any, **kwargs: Any): + new_object = super().__new__(cls, *args, **kwargs) + # temporary fix until https://github.com/tiangolo/sqlmodel/issues/149 gets merged + if not hasattr(new_object, "_user_emojis"): + new_object._init_private_attributes() + return new_object + id: Optional[UUID] = Field( sa_column=sa.Column( pg.UUID(as_uuid=True), primary_key=True, default=uuid4, server_default=sa.text("gen_random_uuid()") @@ -49,11 +57,24 @@ class Message(SQLModel, table=True): rank: Optional[int] = Field(nullable=True) + emojis: Optional[dict[str, int]] = Field(default=None, sa_column=sa.Column(pg.JSONB), nullable=False) + _user_emojis: Optional[list[str]] = PrivateAttr(default=None) + def ensure_is_message(self) -> None: if not self.payload or not isinstance(self.payload.payload, MessagePayload): raise OasstError("Invalid message", OasstErrorCode.INVALID_MESSAGE, HTTPStatus.INTERNAL_SERVER_ERROR) + def has_emoji(self, emoji_code: str) -> bool: + return self.emojis and emoji_code in self.emojis and self.emojis[emoji_code] > 0 + + def has_user_emoji(self, emoji_code: str) -> bool: + return self._user_emojis and emoji_code in self._user_emojis + @property def text(self) -> str: self.ensure_is_message() return self.payload.payload.text + + @property + def user_emojis(self) -> str: + return self._user_emojis diff --git a/backend/oasst_backend/models/message_emoji.py b/backend/oasst_backend/models/message_emoji.py new file mode 100644 index 00000000..9e6e92fb --- /dev/null +++ b/backend/oasst_backend/models/message_emoji.py @@ -0,0 +1,27 @@ +from datetime import datetime +from typing import Optional +from uuid import UUID + +import sqlalchemy as sa +import sqlalchemy.dialects.postgresql as pg +from sqlmodel import Field, Index, SQLModel + + +class MessageEmoji(SQLModel, table=True): + __tablename__ = "message_emoji" + __table_args__ = (Index("ix_message_emoji__user_id__message_id", "user_id", "message_id", unique=False),) + + message_id: Optional[UUID] = Field( + sa_column=sa.Column( + pg.UUID(as_uuid=True), sa.ForeignKey("message.id", ondelete="CASCADE"), nullable=False, primary_key=True + ) + ) + user_id: UUID = Field( + sa_column=sa.Column( + pg.UUID(as_uuid=True), sa.ForeignKey("user.id", ondelete="CASCADE"), nullable=False, primary_key=True + ) + ) + emoji: str = Field(nullable=False, max_length=128, primary_key=True) + created_date: Optional[datetime] = Field( + sa_column=sa.Column(sa.DateTime(timezone=True), nullable=False, server_default=sa.func.current_timestamp()) + ) diff --git a/backend/oasst_backend/models/message_reaction.py b/backend/oasst_backend/models/message_reaction.py index 4c50143e..c77c4c27 100644 --- a/backend/oasst_backend/models/message_reaction.py +++ b/backend/oasst_backend/models/message_reaction.py @@ -26,3 +26,4 @@ class MessageReaction(SQLModel, table=True): payload_type: str = Field(nullable=False, max_length=200) payload: PayloadContainer = Field(sa_column=sa.Column(payload_column_type(PayloadContainer), nullable=False)) api_client_id: UUID = Field(nullable=False, foreign_key="api_client.id") + message_id: Optional[UUID] = Field(nullable=True, index=True) diff --git a/backend/oasst_backend/models/task.py b/backend/oasst_backend/models/task.py index a59f689e..7f91b157 100644 --- a/backend/oasst_backend/models/task.py +++ b/backend/oasst_backend/models/task.py @@ -20,7 +20,9 @@ class Task(SQLModel, table=True): ), ) created_date: Optional[datetime] = Field( - sa_column=sa.Column(sa.DateTime(timezone=True), nullable=False, server_default=sa.func.current_timestamp()), + sa_column=sa.Column( + sa.DateTime(timezone=True), nullable=False, index=True, server_default=sa.func.current_timestamp() + ), ) expiry_date: Optional[datetime] = Field(sa_column=sa.Column(sa.DateTime(timezone=True), nullable=True)) user_id: Optional[UUID] = Field(nullable=True, foreign_key="user.id", index=True) diff --git a/backend/oasst_backend/models/text_labels.py b/backend/oasst_backend/models/text_labels.py index 1d238ef2..aa5940b0 100644 --- a/backend/oasst_backend/models/text_labels.py +++ b/backend/oasst_backend/models/text_labels.py @@ -27,3 +27,4 @@ class TextLabels(SQLModel, table=True): sa_column=sa.Column(pg.UUID(as_uuid=True), sa.ForeignKey("message.id"), nullable=True) ) labels: dict[str, float] = Field(default={}, sa_column=sa.Column(pg.JSONB), nullable=False) + task_id: Optional[UUID] = Field(nullable=True, index=True) diff --git a/backend/oasst_backend/models/user.py b/backend/oasst_backend/models/user.py index d882a15a..6b6dced4 100644 --- a/backend/oasst_backend/models/user.py +++ b/backend/oasst_backend/models/user.py @@ -30,6 +30,7 @@ class User(SQLModel, table=True): enabled: bool = Field(sa_column=sa.Column(sa.Boolean, nullable=False, server_default=sa.true())) notes: str = Field(sa_column=sa.Column(AutoString(length=1024), nullable=False, server_default="")) deleted: bool = Field(sa_column=sa.Column(sa.Boolean, nullable=False, server_default=sa.false())) + show_on_leaderboard: bool = Field(sa_column=sa.Column(sa.Boolean, nullable=False, server_default=sa.true())) def to_protocol_frontend_user(self): return protocol.FrontEndUser( diff --git a/backend/oasst_backend/prompt_repository.py b/backend/oasst_backend/prompt_repository.py index abf5b721..01f63d26 100644 --- a/backend/oasst_backend/prompt_repository.py +++ b/backend/oasst_backend/prompt_repository.py @@ -1,18 +1,22 @@ import random +import re from collections import defaultdict -from datetime import datetime +from datetime import datetime, timedelta from http import HTTPStatus -from typing import List, Optional, Tuple +from typing import Optional from uuid import UUID, uuid4 import oasst_backend.models.db_payload as db_payload import sqlalchemy as sa from loguru import logger +from oasst_backend.api.deps import FrontendUserId +from oasst_backend.config import settings from oasst_backend.journal_writer import JournalWriter from oasst_backend.models import ( ApiClient, Message, MessageEmbedding, + MessageEmoji, MessageReaction, MessageToxicity, MessageTreeState, @@ -28,8 +32,10 @@ from oasst_backend.utils.database_utils import CommitMode, managed_tx_method from oasst_shared.exceptions import OasstError, OasstErrorCode from oasst_shared.schemas import protocol as protocol_schema from oasst_shared.schemas.protocol import SystemStats -from oasst_shared.utils import unaware_to_utc -from sqlmodel import Session, and_, func, not_, or_, text, update +from oasst_shared.utils import unaware_to_utc, utcnow +from sqlalchemy.orm import Query +from sqlalchemy.orm.attributes import flag_modified +from sqlmodel import JSON, Session, and_, func, literal_column, not_, or_, text, update from starlette.status import HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND @@ -39,14 +45,30 @@ class PromptRepository: db: Session, api_client: ApiClient, client_user: Optional[protocol_schema.User] = None, + *, user_repository: Optional[UserRepository] = None, task_repository: Optional[TaskRepository] = None, + user_id: Optional[UUID] = None, + auth_method: Optional[str] = None, + username: Optional[str] = None, + frontend_user: Optional[FrontendUserId] = None, ): self.db = db self.api_client = api_client self.user_repository = user_repository or UserRepository(db, api_client) - self.user = self.user_repository.lookup_client_user(client_user, create_missing=True) - self.user_id = self.user.id if self.user else None + + if frontend_user and not auth_method and not username: + auth_method, username = frontend_user + + if user_id: + self.user = self.user_repository.get_user(id=user_id) + self.user_id = self.user.id + elif auth_method and username: + self.user = self.user_repository.query_frontend_user(auth_method=auth_method, username=username) + self.user_id = self.user.id + else: + self.user = self.user_repository.lookup_client_user(client_user, create_missing=True) + self.user_id = self.user.id if self.user else None logger.debug(f"PromptRepository(api_client_id={self.api_client.id}, {self.user_id=})") self.task_repository = task_repository or TaskRepository( db, api_client, client_user, user_repository=self.user_repository @@ -168,6 +190,18 @@ class PromptRepository: role = None depth = 0 + # reject whitespaces match with ^\s+$ + if re.match(r"^\s+$", text): + raise OasstError("Message text is empty", OasstErrorCode.TASK_MESSAGE_TEXT_EMPTY) + + # ensure message size is below the predefined limit + if len(text) > settings.MESSAGE_SIZE_LIMIT: + logger.error(f"Message size {len(text)=} exceeds size limit of {settings.MESSAGE_SIZE_LIMIT=}.") + raise OasstError("Message size too long.", OasstErrorCode.TASK_MESSAGE_TOO_LONG) + + if self.check_users_recent_replies_for_duplicates(text): + raise OasstError("User recent messages have duplicates", OasstErrorCode.TASK_MESSAGE_DUPLICATED) + if task.parent_message_id: parent_message = self.fetch_message(task.parent_message_id) @@ -245,7 +279,7 @@ class PromptRepository: # store reaction to message reaction_payload = db_payload.RatingReactionPayload(rating=rating.rating) - reaction = self.insert_reaction(message.id, reaction_payload) + reaction = self.insert_reaction(task_id=task.id, payload=reaction_payload, message_id=message.id) if not task.collective: task.done = True self.db.add(task) @@ -255,7 +289,7 @@ class PromptRepository: return reaction @managed_tx_method(CommitMode.COMMIT) - def store_ranking(self, ranking: protocol_schema.MessageRanking) -> Tuple[MessageReaction, Task]: + def store_ranking(self, ranking: protocol_schema.MessageRanking) -> tuple[MessageReaction, Task]: # fetch task task = self.task_repository.fetch_task_by_frontend_message_id(ranking.message_id) self._validate_task(task, frontend_message_id=ranking.message_id) @@ -295,7 +329,7 @@ class PromptRepository: ranking_parent_id=task_payload.ranking_parent_id, message_tree_id=task_payload.message_tree_id, ) - reaction = self.insert_reaction(task.id, reaction_payload) + reaction = self.insert_reaction(task_id=task.id, payload=reaction_payload, message_id=parent_msg.id) self.journal.log_ranking(task, message_id=parent_msg.id, ranking=ranking.ranking) logger.info(f"Ranking {ranking.ranking} stored for task {task.id}.") @@ -313,9 +347,8 @@ class PromptRepository: reaction_payload = db_payload.RankingReactionPayload( ranking=ranking.ranking, ranked_message_ids=ranked_message_ids ) - reaction = self.insert_reaction(task.id, reaction_payload) - # TODO: resolve message_id - self.journal.log_ranking(task, message_id=None, ranking=ranking.ranking) + reaction = self.insert_reaction(task_id=task.id, payload=reaction_payload, message_id=None) + # self.journal.log_ranking(task, message_id=None, ranking=ranking.ranking) logger.info(f"Ranking {ranking.ranking} stored for task {task.id}.") @@ -346,13 +379,13 @@ class PromptRepository: return message_toxicity @managed_tx_method(CommitMode.FLUSH) - def insert_message_embedding(self, message_id: UUID, model: str, embedding: List[float]) -> MessageEmbedding: + def insert_message_embedding(self, message_id: UUID, model: str, embedding: list[float]) -> MessageEmbedding: """Insert the embedding of a new message in the database. Args: message_id (UUID): the identifier of the message we want to save its embedding model (str): the model used for creating the embedding - embedding (List[float]): the values obtained from the message & model + embedding (list[float]): the values obtained from the message & model Raises: OasstError: if misses some of the before params @@ -366,7 +399,9 @@ class PromptRepository: return message_embedding @managed_tx_method(CommitMode.FLUSH) - def insert_reaction(self, task_id: UUID, payload: db_payload.ReactionPayload) -> MessageReaction: + def insert_reaction( + self, task_id: UUID, payload: db_payload.ReactionPayload, message_id: Optional[UUID] + ) -> MessageReaction: self.ensure_user_is_enabled() container = PayloadContainer(payload=payload) @@ -376,12 +411,13 @@ class PromptRepository: payload=container, api_client_id=self.api_client.id, payload_type=type(payload).__name__, + message_id=message_id, ) self.db.add(reaction) return reaction @managed_tx_method(CommitMode.FLUSH) - def store_text_labels(self, text_labels: protocol_schema.TextLabels) -> Tuple[TextLabels, Task, Message]: + def store_text_labels(self, text_labels: protocol_schema.TextLabels) -> tuple[TextLabels, Task, Message]: valid_labels: Optional[list[str]] = None mandatory_labels: Optional[list[str]] = None @@ -441,11 +477,24 @@ class PromptRepository: user_id=self.user_id, text=text_labels.text, labels=text_labels.labels, + task_id=task.id if task else None, ) if message_id: - message = self.fetch_message(message_id) - if task: + if not task: + if text_labels.is_report is True: + message = self.handle_message_emoji( + message_id, protocol_schema.EmojiOp.add, protocol_schema.EmojiCode.red_flag + ) + + # update existing record for repeated updates (same user no task associated) + existing_text_label = self.fetch_non_task_text_labels(message_id, self.user_id) + if existing_text_label is not None: + existing_text_label.labels = text_labels.labels + model = existing_text_label + + else: + message = self.fetch_message(message_id) message.review_count += 1 self.db.add(message) @@ -519,6 +568,46 @@ class PromptRepository: qry = qry.filter(Message.review_result) if not include_deleted: qry = qry.filter(not_(Message.deleted)) + return self._add_user_emojis_all(qry) + + def check_users_recent_replies_for_duplicates(self, text: str) -> bool: + """ + Checks if the user has recently replied with the same text within a given time period. + """ + + user_id = self.user_id + logger.debug(f"Checking for duplicate tasks for user {user_id}") + # messages in the past 24 hours + messages = ( + self.db.query(Message) + .filter(Message.user_id == user_id) + .order_by(Message.created_date.desc()) + .filter( + Message.created_date > utcnow() - timedelta(minutes=settings.DUPLICATE_MESSAGE_FILTER_WINDOW_MINUTES) + ) + .all() + ) + if not messages: + return False + for msg in messages: + if msg.text == text: + return True + return False + + def fetch_user_message_trees( + self, user_id: Message.user_id, reviewed: bool = True, include_deleted: bool = False + ) -> list[Message]: + qry = self.db.query(Message).filter(Message.user_id == user_id) + if reviewed: + qry = qry.filter(Message.review_result) + if not include_deleted: + qry = qry.filter(not_(Message.deleted)) + return self._add_user_emojis_all(qry) + + def fetch_message_trees_ready_for_export(self) -> list[MessageTreeState]: + qry = self.db.query(MessageTreeState).filter( + MessageTreeState.state == message_tree_state.State.READY_FOR_EXPORT + ) return qry.all() def fetch_multiple_random_replies(self, max_size: int = 5, message_role: str = None): @@ -556,11 +645,25 @@ class PromptRepository: return conversation, replies def fetch_message(self, message_id: UUID, fail_if_missing: bool = True) -> Optional[Message]: + qry = self.db.query(Message).filter(Message.id == message_id) + messages = self._add_user_emojis_all(qry) + message = messages[0] if messages else None + message = self.db.query(Message).filter(Message.id == message_id).one_or_none() if fail_if_missing and not message: raise OasstError("Message not found", OasstErrorCode.MESSAGE_NOT_FOUND, HTTP_404_NOT_FOUND) return message + def fetch_non_task_text_labels(self, message_id: UUID, user_id: UUID) -> Optional[TextLabels]: + + query = ( + self.db.query(TextLabels) + .outerjoin(Task, Task.id == TextLabels.id) + .filter(Task.id.is_(None), TextLabels.message_id == message_id, TextLabels.user_id == user_id) + ) + text_label = query.one_or_none() + return text_label + @staticmethod def trace_conversation(messages: list[Message] | dict[UUID, Message], last_message: Message) -> list[Message]: """ @@ -620,9 +723,27 @@ class PromptRepository: qry = qry.filter(Message.review_result) if exclude_deleted: qry = qry.filter(Message.deleted == sa.false()) - children = qry.all() + children = self._add_user_emojis_all(qry) return children + def fetch_message_siblings( + self, message: Message | UUID, reviewed: Optional[bool] = True, deleted: Optional[bool] = False + ) -> list[Message]: + """ + Get siblings of a message (other messages with the same parent_id) + """ + if isinstance(message, Message): + message = message.id + + parent_qry = self.db.query(Message.parent_id).filter(Message.id == message).subquery() + qry = self.db.query(Message).filter(Message.parent_id == parent_qry.c.parent_id) + if reviewed is not None: + qry = qry.filter(Message.review_result == reviewed) + if deleted is not None: + qry = qry.filter(Message.deleted == deleted) + siblings = self._add_user_emojis_all(qry) + return siblings + @staticmethod def trace_descendants(root: Message, messages: list[Message]) -> list[Message]: children = defaultdict(list) @@ -651,7 +772,7 @@ class PromptRepository: if max_depth is not None: desc = desc.filter(Message.depth <= max_depth) - desc = desc.all() + desc = self._add_user_emojis_all(desc) return self.trace_descendants(message, desc) @@ -665,6 +786,33 @@ class PromptRepository: max_message = max(tree, key=lambda m: m.children_count) return max_message, [m for m in tree if m.parent_id == max_message.id] + def _add_user_emojis_all(self, qry: Query) -> list[Message]: + if self.user_id is None: + return qry.all() + + sq = qry.subquery("m") + qry = ( + self.db.query(Message, func.string_agg(MessageEmoji.emoji, literal_column("','")).label("user_emojis")) + .select_entity_from(sq) + .outerjoin( + MessageEmoji, + and_( + sq.c.id == MessageEmoji.message_id, + MessageEmoji.user_id == self.user_id, + sq.c.emojis != JSON.NULL, + ), + ) + .group_by(sq) + ) + messages: list[Message] = [] + for x in qry: + m: Message = x.Message + user_emojis = x["user_emojis"] + if user_emojis: + m._user_emojis = user_emojis.split(",") + messages.append(m) + return messages + def query_messages_ordered_by_created_date( self, user_id: Optional[UUID] = None, @@ -679,6 +827,7 @@ class PromptRepository: deleted: Optional[bool] = None, desc: bool = False, limit: Optional[int] = 100, + lang: Optional[str] = None, ) -> list[Message]: if not self.api_client.trusted: if not api_client_id: @@ -693,7 +842,7 @@ class PromptRepository: if user_id: qry = qry.filter(Message.user_id == user_id) if username or auth_method: - if not username and auth_method: + if not (username and auth_method): raise OasstError("Auth method or username missing.", OasstErrorCode.AUTH_AND_USERNAME_REQUIRED) qry = qry.join(User) qry = qry.filter(User.username == username, User.auth_method == auth_method) @@ -743,7 +892,10 @@ class PromptRepository: if limit is not None: qry = qry.limit(limit) - return qry.all() + if lang is not None: + qry = qry.filter(Message.lang == lang) + + return self._add_user_emojis_all(qry) def update_children_counts(self, message_tree_id: UUID): sql_update_children_count = """ @@ -805,3 +957,82 @@ WHERE message.id = cc.id; deleted=result.get(True, 0), message_trees=result.get(None, 0), ) + + def handle_message_emoji(self, message_id: UUID, op: protocol_schema.EmojiOp, emoji: protocol_schema) -> Message: + self.ensure_user_is_enabled() + + message = self.fetch_message(message_id) + + # check if emoji exists + existing_emoji = ( + self.db.query(MessageEmoji) + .filter( + MessageEmoji.message_id == message_id, MessageEmoji.user_id == self.user_id, MessageEmoji.emoji == emoji + ) + .one_or_none() + ) + + if existing_emoji: + if op == protocol_schema.EmojiOp.add: + logger.info(f"Emoji record already exists {message_id=}, {emoji=}, {self.user_id=}") + return message + elif op == protocol_schema.EmojiOp.togggle: + op = protocol_schema.EmojiOp.remove + + if existing_emoji is None: + if op == protocol_schema.EmojiOp.remove: + logger.info(f"Emoji record not found {message_id=}, {emoji=}, {self.user_id=}") + return message + elif op == protocol_schema.EmojiOp.togggle: + op = protocol_schema.EmojiOp.add + + if op == protocol_schema.EmojiOp.add: + # hard coded exclusivity of thumbs_up & thumbs_down + if emoji == protocol_schema.EmojiCode.thumbs_up and message.has_user_emoji( + protocol_schema.EmojiCode.thumbs_down.value + ): + message = self.handle_message_emoji( + message_id, protocol_schema.EmojiOp.remove, protocol_schema.EmojiCode.thumbs_down + ) + elif emoji == protocol_schema.EmojiCode.thumbs_down and message.has_user_emoji( + protocol_schema.EmojiCode.thumbs_up.value + ): + message = self.handle_message_emoji( + message_id, protocol_schema.EmojiOp.remove, protocol_schema.EmojiCode.thumbs_up + ) + + # insert emoji record & increment count + message_emoji = MessageEmoji(message_id=message.id, user_id=self.user_id, emoji=emoji) + self.db.add(message_emoji) + emoji_counts = message.emojis + if not emoji_counts: + message.emojis = {emoji.value: 1} + else: + count = emoji_counts.get(emoji.value) or 0 + emoji_counts[emoji.value] = count + 1 + if message._user_emojis is None: + message._user_emojis = [] + if emoji.value not in message._user_emojis: + message._user_emojis.append(emoji.value) + elif op == protocol_schema.EmojiOp.remove: + # remove emoji record and & decrement count + message = self.fetch_message(message_id) + if message._user_emojis and emoji.value in message._user_emojis: + message._user_emojis.remove(emoji.value) + self.db.delete(existing_emoji) + emoji_counts = message.emojis + count = emoji_counts.get(emoji.value) + if count is not None: + if count == 1: + del emoji_counts[emoji.value] + else: + emoji_counts[emoji.value] = count - 1 + flag_modified(message, "emojis") + self.db.add(message) + else: + raise OasstError("Emoji op not supported", OasstErrorCode.EMOJI_OP_UNSUPPORTED) + + flag_modified(message, "emojis") + self.db.add(message) + self.db.flush() + return message diff --git a/backend/oasst_backend/schemas/text_labels.py b/backend/oasst_backend/schemas/text_labels.py index 9135c558..e846d8f4 100644 --- a/backend/oasst_backend/schemas/text_labels.py +++ b/backend/oasst_backend/schemas/text_labels.py @@ -1,13 +1,6 @@ -from typing import Optional - +from oasst_shared.schemas.protocol import LabelDescription from pydantic import BaseModel -class LabelOption(BaseModel): - name: str - display_text: str - help_text: Optional[str] - - class ValidLabelsResponse(BaseModel): - valid_labels: list[LabelOption] + valid_labels: list[LabelDescription] diff --git a/backend/oasst_backend/task_repository.py b/backend/oasst_backend/task_repository.py index eb100fe3..fdf6ebb7 100644 --- a/backend/oasst_backend/task_repository.py +++ b/backend/oasst_backend/task_repository.py @@ -1,3 +1,4 @@ +from datetime import timedelta from typing import Optional from uuid import UUID @@ -9,7 +10,7 @@ from oasst_backend.user_repository import UserRepository from oasst_backend.utils.database_utils import CommitMode, managed_tx_method from oasst_shared.exceptions.oasst_api_error import OasstError, OasstErrorCode from oasst_shared.schemas import protocol as protocol_schema -from sqlmodel import Session +from sqlmodel import Session, func, or_ from starlette.status import HTTP_404_NOT_FOUND @@ -100,6 +101,7 @@ class TaskRepository: message_id=task.message_id, conversation=task.conversation, reply=task.reply, + reply_message=task.reply_message, valid_labels=task.valid_labels, mandatory_labels=task.mandatory_labels, mode=task.mode, @@ -111,6 +113,7 @@ class TaskRepository: message_id=task.message_id, conversation=task.conversation, reply=task.reply, + reply_message=task.reply_message, valid_labels=task.valid_labels, mandatory_labels=task.mandatory_labels, mode=task.mode, @@ -219,3 +222,16 @@ class TaskRepository: def fetch_task_by_id(self, task_id: UUID) -> Task: task = self.db.query(Task).filter(Task.api_client_id == self.api_client.id, Task.id == task_id).one_or_none() return task + + def fetch_recent_reply_tasks( + self, max_age: timedelta = timedelta(minutes=5), done: bool = False, limit: int = 100 + ) -> list[Task]: + qry = self.db.query(Task).filter( + func.age(Task.created_date) < max_age, + or_(Task.payload_type == "AssistantReplyPayload", Task.payload_type == "PrompterReplyPayload"), + ) + if done is not None: + qry = qry.filter(Task.done == done) + if limit: + qry = qry.limit(limit) + return qry.all() diff --git a/backend/oasst_backend/tree_manager.py b/backend/oasst_backend/tree_manager.py index 1224af4a..782b9af8 100644 --- a/backend/oasst_backend/tree_manager.py +++ b/backend/oasst_backend/tree_manager.py @@ -1,5 +1,7 @@ +import json import random -from datetime import datetime +import sys +from datetime import datetime, timedelta from enum import Enum from http import HTTPStatus from typing import Any, Dict, List, Optional, Tuple @@ -7,11 +9,17 @@ from uuid import UUID import numpy as np import pydantic +from fastapi.encoders import jsonable_encoder from loguru import logger -from oasst_backend.api.v1.utils import prepare_conversation, prepare_conversation_message_list +from oasst_backend.api.v1.utils import ( + prepare_conversation, + prepare_conversation_message, + prepare_conversation_message_list, +) from oasst_backend.config import TreeManagerConfiguration, settings from oasst_backend.models import Message, MessageReaction, MessageTreeState, Task, TextLabels, User, message_tree_state from oasst_backend.prompt_repository import PromptRepository +from oasst_backend.utils import tree_export from oasst_backend.utils.database_utils import CommitMode, async_managed_tx_method, managed_tx_method from oasst_backend.utils.hugging_face import HfClassificationModel, HfEmbeddingModel, HfUrl, HuggingFaceAPI from oasst_backend.utils.ranking import ranked_pairs @@ -37,8 +45,9 @@ class TaskRole(Enum): class ActiveTreeSizeRow(pydantic.BaseModel): message_tree_id: UUID - tree_size: int goal_tree_size: int + tree_size: int + awaiting_review: Optional[int] @property def remaining_messages(self) -> int: @@ -89,8 +98,6 @@ class TreeManagerStats(pydantic.BaseModel): class TreeManager: - _all_text_labels = list(map(lambda x: x.value, protocol_schema.TextLabel)) - def __init__( self, db: Session, @@ -197,7 +204,7 @@ class TreeManager: lang = "en" logger.warning("Task availability request without lang tag received, assuming lang='en'.") - num_active_trees = self.query_num_active_trees(lang=lang) + num_active_trees = self.query_num_active_trees(lang=lang, exclude_ranking=True) extendible_parents = self.query_extendible_parents(lang=lang) prompts_need_review = self.query_prompts_need_review(lang=lang) replies_need_review = self.query_replies_need_review(lang=lang) @@ -211,6 +218,15 @@ class TreeManager: incomplete_rankings=incomplete_rankings, ) + @staticmethod + def _get_label_descriptions(valid_labels: list[TextLabels]) -> list[protocol_schema.LabelDescription]: + return [ + protocol_schema.LabelDescription( + name=l.value, widget=l.widget.value, display_text=l.display_text, help_text=l.help_text + ) + for l in valid_labels + ] + def next_task( self, desired_task_type: protocol_schema.TaskRequestType = protocol_schema.TaskRequestType.random, @@ -225,7 +241,7 @@ class TreeManager: lang = "en" logger.warning("Task request without lang tag received, assuming 'en'.") - num_active_trees = self.query_num_active_trees(lang=lang) + num_active_trees = self.query_num_active_trees(lang=lang, exclude_ranking=True) prompts_need_review = self.query_prompts_need_review(lang=lang) replies_need_review = self.query_replies_need_review(lang=lang) extendible_parents = self.query_extendible_parents(lang=lang) @@ -334,6 +350,7 @@ class TreeManager: message_tree_id = messages[-1].message_tree_id case TaskType.LABEL_REPLY: + if task_role == TaskRole.PROMPTER: replies_need_review = list(filter(lambda m: m.role == "prompter", replies_need_review)) elif task_role == TaskRole.ASSISTANT: @@ -349,58 +366,99 @@ class TreeManager: self.cfg.p_full_labeling_review_reply_prompter: float = 0.1 label_mode = protocol_schema.LabelTaskMode.full - valid_labels = self._all_text_labels + label_disposition = protocol_schema.LabelTaskDisposition.quality if message.role == "assistant": + valid_labels = self.cfg.labels_assistant_reply if ( desired_task_type == protocol_schema.TaskRequestType.random and random.random() > self.cfg.p_full_labeling_review_reply_assistant ): - valid_labels = list(map(lambda x: x.value, self.cfg.mandatory_labels_assistant_reply)) label_mode = protocol_schema.LabelTaskMode.simple + label_disposition = protocol_schema.LabelTaskDisposition.spam + valid_labels = list(self.cfg.mandatory_labels_assistant_reply) + if protocol_schema.TextLabel.quality not in valid_labels: + valid_labels.append(protocol_schema.TextLabel.quality) + logger.info(f"Generating a LabelAssistantReplyTask. ({label_mode=:s})") task = protocol_schema.LabelAssistantReplyTask( message_id=message.id, conversation=conversation, reply=message.text, - valid_labels=valid_labels, + reply_message=prepare_conversation_message(message), + valid_labels=list(map(lambda x: x.value, valid_labels)), mandatory_labels=list(map(lambda x: x.value, self.cfg.mandatory_labels_assistant_reply)), mode=label_mode, + disposition=label_disposition, + labels=self._get_label_descriptions(valid_labels), ) else: + valid_labels = self.cfg.labels_prompter_reply if ( desired_task_type == protocol_schema.TaskRequestType.random and random.random() > self.cfg.p_full_labeling_review_reply_prompter ): - valid_labels = list(map(lambda x: x.value, self.cfg.mandatory_labels_prompter_reply)) label_mode = protocol_schema.LabelTaskMode.simple + label_disposition = protocol_schema.LabelTaskDisposition.spam + valid_labels = list(self.cfg.mandatory_labels_prompter_reply) + if protocol_schema.TextLabel.quality not in valid_labels: + valid_labels.append(protocol_schema.TextLabel.quality) + logger.info(f"Generating a LabelPrompterReplyTask. ({label_mode=:s})") task = protocol_schema.LabelPrompterReplyTask( message_id=message.id, conversation=conversation, reply=message.text, - valid_labels=valid_labels, + reply_message=prepare_conversation_message(message), + valid_labels=list(map(lambda x: x.value, valid_labels)), mandatory_labels=list(map(lambda x: x.value, self.cfg.mandatory_labels_prompter_reply)), mode=label_mode, + disposition=label_disposition, + labels=self._get_label_descriptions(valid_labels), ) parent_message_id = message.id message_tree_id = message.message_tree_id case TaskType.REPLY: - # select a tree with missing replies + + recent_reply_tasks = self.pr.task_repository.fetch_recent_reply_tasks( + max_age=timedelta(seconds=self.cfg.recent_tasks_span_sec), done=False + ) + recent_reply_task_parents = {t.parent_message_id for t in recent_reply_tasks} + if task_role == TaskRole.PROMPTER: extendible_parents = list(filter(lambda x: x.parent_role == "assistant", extendible_parents)) elif task_role == TaskRole.ASSISTANT: extendible_parents = list(filter(lambda x: x.parent_role == "prompter", extendible_parents)) + # select a tree with missing replies if len(extendible_parents) > 0: - random_parent = random.choice(extendible_parents) + random_parent: ExtendibleParentRow = None + if self.cfg.p_lonely_child_extension > 0 and self.cfg.lonely_children_count > 1: + # check if we have extendible parents with a small number of replies + + lonely_children_parents = [ + p + for p in extendible_parents + if 0 < p.active_children_count < self.cfg.lonely_children_count + and p.parent_id not in recent_reply_task_parents + ] + if len(lonely_children_parents) > 0 and random.random() < self.cfg.p_lonely_child_extension: + random_parent = random.choice(lonely_children_parents) + + if random_parent is None: + # try to exclude parents for which tasks were recently handed out + fresh_parents = [p for p in extendible_parents if p.parent_id not in recent_reply_task_parents] + if len(fresh_parents) > 0: + random_parent = random.choice(fresh_parents) + else: + random_parent = random.choice(extendible_parents) # fetch random conversation to extend logger.debug(f"selected {random_parent=}") messages = self.pr.fetch_message_conversation(random_parent.parent_id) - assert all(m.review_result for m in messages) # ensure all messages have positive review + assert all(m.review_result for m in messages) # ensure all messages have positive reviews conversation = prepare_conversation(messages) # generate reply task depending on last message @@ -419,19 +477,23 @@ class TreeManager: message = random.choice(prompts_need_review) label_mode = protocol_schema.LabelTaskMode.full - valid_labels = self._all_text_labels + label_disposition = protocol_schema.LabelTaskDisposition.quality + valid_labels = self.cfg.labels_initial_prompt if random.random() > self.cfg.p_full_labeling_review_prompt: - valid_labels = list(map(lambda x: x.value, self.cfg.mandatory_labels_initial_prompt)) + valid_labels = self.cfg.mandatory_labels_initial_prompt label_mode = protocol_schema.LabelTaskMode.simple + label_disposition = protocol_schema.LabelTaskDisposition.spam logger.info(f"Generating a LabelInitialPromptTask ({label_mode=:s}).") task = protocol_schema.LabelInitialPromptTask( message_id=message.id, prompt=message.text, - valid_labels=valid_labels, + valid_labels=list(map(lambda x: x.value, valid_labels)), mandatory_labels=list(map(lambda x: x.value, self.cfg.mandatory_labels_initial_prompt)), mode=label_mode, + disposition=label_disposition, + labels=self._get_label_descriptions(valid_labels), ) parent_message_id = message.id @@ -464,14 +526,6 @@ class TreeManager: logger.info( f"Frontend reports text reply to {interaction.message_id=} with {interaction.text=} by {interaction.user=}." ) - - # ensure message size is below the predefined limit - if len(interaction.text) > settings.MESSAGE_SIZE_LIMIT: - logger.error( - f"Message size {len(interaction.text)=} exceeds size limit of {settings.MESSAGE_SIZE_LIMIT=}." - ) - raise OasstError("Message size too long.", OasstErrorCode.TASK_MESSAGE_TOO_LONG) - # here we store the text reply in the database message = pr.store_text_reply( text=interaction.text, @@ -502,19 +556,18 @@ class TreeManager: try: model_name: str = HfClassificationModel.TOXIC_ROBERTA.value hugging_face_api: HuggingFaceAPI = HuggingFaceAPI( - f"{HfUrl.HUGGINGFACE_FEATURE_EXTRACTION.value}/{model_name}" + f"{HfUrl.HUGGINGFACE_TOXIC_CLASSIFICATION.value}/{model_name}" ) toxicity: List[List[Dict[str, Any]]] = await hugging_face_api.post(interaction.text) toxicity = toxicity[0][0] - pr.insert_toxicity( message_id=message.id, model=model_name, score=toxicity["score"], label=toxicity["label"] ) except OasstError: logger.error( - f"Could not compute toxicity for text reply to {interaction.message_id=} with {interaction.text=} by {interaction.user=}." + f"Could not compute toxicity for text reply to {interaction.message_id=} with {interaction.text=} by {interaction.user=}." ) case protocol_schema.MessageRating: @@ -530,9 +583,7 @@ class TreeManager: ) _, task = pr.store_ranking(interaction) - - ok, rankings_by_message = self.check_condition_for_scoring_state(task.message_tree_id) - self.update_message_ranks(task.message_tree_id, rankings_by_message) + self.check_condition_for_scoring_state(task.message_tree_id) case protocol_schema.TextLabels: logger.info( @@ -541,7 +592,7 @@ class TreeManager: _, task, msg = pr.store_text_labels(interaction) - # if it was a respones for a task, check if we have enough reviews to calc review_result + # if it was a response for a task, check if we have enough reviews to calc review_result if task and msg: reviews = self.query_reviews_for_message(msg.id) acceptance_score = self._calculate_acceptance(reviews) @@ -622,8 +673,8 @@ class TreeManager: # check if desired tree size has been reached and all nodes have been reviewed tree_size = self.query_tree_size(message_tree_id) - if tree_size.remaining_messages > 0: - logger.debug(f"False {tree_size.remaining_messages=}") + if tree_size.remaining_messages > 0 or tree_size.awaiting_review > 0: + logger.debug(f"False {tree_size.remaining_messages=}, {tree_size.awaiting_review=}") return False self._enter_state(mts, message_tree_state.State.RANKING) @@ -647,9 +698,12 @@ class TreeManager: return False, None self._enter_state(mts, message_tree_state.State.READY_FOR_SCORING) - return True, rankings_by_message + self.update_message_ranks(message_tree_id, rankings_by_message) + return True - def update_message_ranks(self, message_tree_id: UUID, rankings_by_message: Dict[int, int]) -> bool: + def update_message_ranks( + self, message_tree_id: UUID, rankings_by_message: dict[UUID, list[MessageReaction]] + ) -> bool: mts = self.pr.fetch_tree_state(message_tree_id) # check state, allow retry if in SCORING_FAILED state @@ -657,19 +711,47 @@ class TreeManager: logger.debug(f"False {mts.active=}, {mts.state=}") return False + if mts.state == message_tree_state.State.SCORING_FAILED: + mts.active = True + mts.state = message_tree_state.State.READY_FOR_SCORING + try: for rankings in rankings_by_message.values(): - sorted_messages = [] - for msg_reaction in rankings: - sorted_messages.append(msg_reaction.payload.payload.ranked_message_ids) - logger.debug(f"SORTED MESSAGE {sorted_messages}") - consensus = ranked_pairs(sorted_messages) + ordered_ids_list: list[list[UUID]] = [ + msg_reaction.payload.payload.ranked_message_ids for msg_reaction in rankings + ] + + common_set: set[UUID] = set.intersection(*map(set, ordered_ids_list)) + if len(common_set) < 2: + logger.warning("The intersection of ranking results ID sets has less than two elements. Skipping.") + continue + + # keep only elements in commond set + ordered_ids_list = [list(filter(lambda x: x in common_set, ids)) for ids in ordered_ids_list] + assert all(len(x) == len(common_set) for x in ordered_ids_list) + + logger.debug(f"SORTED MESSAGE IDS {ordered_ids_list}") + consensus = ranked_pairs(ordered_ids_list) + assert len(consensus) == len(common_set) logger.debug(f"CONSENSUS: {consensus}\n\n") + + # fetch all siblings and clear ranks + siblings = self.pr.fetch_message_siblings(consensus[0], reviewed=None, deleted=None) + for m in siblings: + m.rank = None + self.db.add(m) + + # index by id + siblings = {m.id: m for m in siblings} + + # set rank for each message that was part of the common set for rank, message_id in enumerate(consensus): - # set rank for each message_id for Message rows - msg = self.pr.fetch_message(message_id=message_id, fail_if_missing=True) - msg.rank = rank - self.db.add(msg) + msg = siblings.get(message_id) + if msg: + msg.rank = rank + self.db.add(msg) + else: + logger.warning(f"Message {message_id=} not found among siblings.") except Exception: logger.exception(f"update_message_ranks({message_tree_id=}) failed") @@ -683,57 +765,65 @@ class TreeManager: # calculate acceptance based on spam label return np.mean([1 - l.labels[protocol_schema.TextLabel.spam] for l in labels]) - def query_prompts_need_review(self, lang: str) -> list[Message]: - """ - Select initial prompt messages with less then required rankings in active message tree - (active == True in message_tree_state) - """ + def _query_need_review( + self, state: message_tree_state.State, required_reviews: int, root: bool, lang: str + ) -> list[Message]: - qry = ( + need_review = ( self.db.query(Message) .select_from(MessageTreeState) .join(Message, MessageTreeState.message_tree_id == Message.message_tree_id) .filter( MessageTreeState.active, - MessageTreeState.state == message_tree_state.State.INITIAL_PROMPT_REVIEW, + MessageTreeState.state == state, not_(Message.review_result), not_(Message.deleted), - Message.review_count < self.cfg.num_reviews_initial_prompt, - Message.parent_id.is_(None), + Message.review_count < required_reviews, Message.lang == lang, ) ) + if root: + need_review = need_review.filter(Message.parent_id.is_(None)) + else: + need_review = need_review.filter(Message.parent_id.is_not(None)) + if not settings.DEBUG_ALLOW_SELF_LABELING: - qry = qry.filter(Message.user_id != self.pr.user_id) + need_review = need_review.filter(Message.user_id != self.pr.user_id) + + if settings.DEBUG_ALLOW_DUPLICATE_TASKS: + qry = need_review + else: + user_id = self.pr.user_id + need_review = need_review.cte(name="need_review") + qry = ( + self.db.query(Message) + .select_entity_from(need_review) + .outerjoin(TextLabels, need_review.c.id == TextLabels.message_id) + .group_by(need_review) + .having( + func.count(TextLabels.id).filter(TextLabels.task_id.is_not(None), TextLabels.user_id == user_id) + == 0 + ) + ) return qry.all() + def query_prompts_need_review(self, lang: str) -> list[Message]: + """ + Select initial prompt messages with less then required rankings in active message tree + (active == True in message_tree_state) + """ + return self._query_need_review( + message_tree_state.State.INITIAL_PROMPT_REVIEW, self.cfg.num_reviews_initial_prompt, True, lang + ) + def query_replies_need_review(self, lang: str) -> list[Message]: """ Select child messages (parent_id IS NOT NULL) with less then required rankings in active message tree (active == True in message_tree_state) """ - - qry = ( - self.db.query(Message) - .select_from(MessageTreeState) - .join(Message, MessageTreeState.message_tree_id == Message.message_tree_id) - .filter( - MessageTreeState.active, - MessageTreeState.state == message_tree_state.State.GROWING, - not_(Message.review_result), - not_(Message.deleted), - Message.review_count < self.cfg.num_reviews_reply, - Message.parent_id.is_not(None), - Message.lang == lang, - ) - ) - - if not settings.DEBUG_ALLOW_SELF_LABELING: - qry = qry.filter(Message.user_id != self.pr.user_id) - - return qry.all() + return self._query_need_review(message_tree_state.State.GROWING, self.cfg.num_reviews_reply, False, lang) _sql_find_incomplete_rankings = """ -- find incomplete rankings @@ -749,17 +839,28 @@ WHERE mts.active -- only consider active trees AND m.parent_id IS NOT NULL -- ignore initial prompts GROUP BY m.parent_id, m.role HAVING COUNT(m.id) > 1 and MIN(m.ranking_count) < :num_required_rankings +""" + + _sql_find_incomplete_rankings_ex = f""" +-- incomplete rankings but exclude of current user +WITH incomplete_rankings AS ({_sql_find_incomplete_rankings}) +SELECT ir.* FROM incomplete_rankings ir + LEFT JOIN message_reaction mr ON ir.parent_id = mr.message_id AND mr.payload_type = 'RankingReactionPayload' +GROUP BY ir.parent_id, ir.role, ir.children_count, ir.child_min_ranking_count, ir.completed_rankings +HAVING(COUNT(mr.message_id) FILTER (WHERE mr.user_id = :user_id) = 0) """ def query_incomplete_rankings(self, lang: str) -> list[IncompleteRankingsRow]: """Query parents which have childern that need further rankings""" + user_id = self.pr.user_id if not settings.DEBUG_ALLOW_DUPLICATE_TASKS else None r = self.db.execute( - text(self._sql_find_incomplete_rankings), + text(self._sql_find_incomplete_rankings_ex), { "num_required_rankings": self.cfg.num_required_rankings, "ranking_state": message_tree_state.State.RANKING, "lang": lang, + "user_id": user_id, }, ) return [IncompleteRankingsRow.from_orm(x) for x in r.all()] @@ -780,17 +881,20 @@ WHERE mts.active -- only consider active trees AND (c.review_result OR coalesce(c.review_count, 0) < :num_reviews_reply) -- don't count children with negative review but count elements under review GROUP BY m.id, m.role, m.depth, m.message_tree_id, mts.max_children_count HAVING COUNT(c.id) < mts.max_children_count -- below maximum number of children + AND COUNT(c.id) FILTER (WHERE c.user_id = :user_id) = 0 -- without reply by user """ def query_extendible_parents(self, lang: str) -> list[ExtendibleParentRow]: """Query parent messages that have not reached the maximum number of replies.""" + user_id = self.pr.user_id if not settings.DEBUG_ALLOW_DUPLICATE_TASKS else None r = self.db.execute( text(self._sql_find_extendible_parents), { "growing_state": message_tree_state.State.GROWING, "num_reviews_reply": self.cfg.num_reviews_reply, "lang": lang, + "user_id": user_id, }, ) return [ExtendibleParentRow.from_orm(x) for x in r.all()] @@ -814,12 +918,14 @@ HAVING COUNT(m.id) < mts.goal_tree_size def query_extendible_trees(self, lang: str) -> list[ActiveTreeSizeRow]: """Query size of active message trees in growing state.""" + user_id = self.pr.user_id if not settings.DEBUG_ALLOW_DUPLICATE_TASKS else None r = self.db.execute( text(self._sql_find_extendible_trees), { "growing_state": message_tree_state.State.GROWING, "num_reviews_reply": self.cfg.num_reviews_reply, "lang": lang, + "user_id": user_id, }, ) return [ActiveTreeSizeRow.from_orm(x) for x in r.all()] @@ -827,18 +933,21 @@ HAVING COUNT(m.id) < mts.goal_tree_size def query_tree_size(self, message_tree_id: UUID) -> ActiveTreeSizeRow: """Returns the number of reviewed not deleted messages in the message tree.""" + required_reviews = settings.tree_manager.num_reviews_reply qry = ( self.db.query( MessageTreeState.message_tree_id.label("message_tree_id"), MessageTreeState.goal_tree_size.label("goal_tree_size"), - func.count(Message.id).label("tree_size"), + func.count(Message.id).filter(Message.review_result).label("tree_size"), + func.count(Message.id) + .filter(not_(Message.review_result), Message.review_count < required_reviews) + .label("awaiting_review"), ) .select_from(MessageTreeState) .outerjoin(Message, MessageTreeState.message_tree_id == Message.message_tree_id) .filter( MessageTreeState.active, not_(Message.deleted), - Message.review_result, MessageTreeState.message_tree_id == message_tree_id, ) .group_by(MessageTreeState.message_tree_id, MessageTreeState.goal_tree_size) @@ -907,7 +1016,7 @@ INNER JOIN message_reaction mr ON mr.task_id = t.id AND mr.payload_type = 'Ranki return rankings_by_message @managed_tx_method(CommitMode.COMMIT) - def ensure_tree_states(self): + def ensure_tree_states(self) -> None: """Add message tree state rows for all root nodes (inital prompt messages).""" missing_tree_ids = self.query_misssing_tree_states() @@ -919,12 +1028,23 @@ INNER JOIN message_reaction mr ON mr.task_id = t.id AND mr.payload_type = 'Ranki logger.info(f"Inserting missing message tree state for message: {id} ({tree_size=}, {state=:s})") self._insert_default_state(id, state=state) - def query_num_active_trees(self, lang: str) -> int: + rankings = ( + self.db.query(MessageTreeState).filter(MessageTreeState.state == message_tree_state.State.RANKING).all() + ) + if len(rankings) > 0: + logger.info(f"Checking state of {len(rankings)} message trees in ranking state.") + for r in rankings: + self.check_condition_for_scoring_state(r.message_tree_id) + + def query_num_active_trees(self, lang: str, exclude_ranking: bool = True) -> int: + """Count all active trees (optionally exclude those in ranking state).""" query = ( self.db.query(func.count(MessageTreeState.message_tree_id)) .join(Message, MessageTreeState.message_tree_id == Message.id) .filter(MessageTreeState.active, Message.lang == lang) ) + if exclude_ranking: + query = query.filter(MessageTreeState.state != message_tree_state.State.RANKING) return query.scalar() def query_reviews_for_message(self, message_id: UUID) -> list[TextLabels]: @@ -1150,6 +1270,7 @@ DELETE FROM message WHERE message_tree_id = :message_tree_id; sql_purge_user = """ DELETE FROM journal WHERE user_id = :user_id; DELETE FROM message_reaction WHERE user_id = :user_id; +DELETE FROM message_emoji WHERE user_id = :user_id; DELETE FROM task WHERE user_id = :user_id; DELETE FROM message WHERE user_id = :user_id; DELETE FROM user_stats WHERE user_id = :user_id; @@ -1161,41 +1282,92 @@ DELETE FROM user_stats WHERE user_id = :user_id; if ban: self.db.execute(update(User).filter(User.id == user_id).values(deleted=True, enabled=False)) + def export_trees_to_file( + self, + message_tree_ids: list[str], + file=None, + reviewed: bool = True, + include_deleted: bool = False, + use_compression: bool = False, + ) -> None: + trees_to_export: List[tree_export.ExportMessageTree] = [] + + for message_tree_id in message_tree_ids: + messages: List[Message] = self.pr.fetch_message_tree(message_tree_id, reviewed, include_deleted) + trees_to_export.append(tree_export.build_export_tree(message_tree_id, messages)) + + if file: + tree_export.write_trees_to_file(file, trees_to_export, use_compression) + else: + sys.stdout.write(json.dumps(jsonable_encoder(trees_to_export), indent=2)) + + def export_all_ready_trees( + self, file: str, reviewed: bool = True, include_deleted: bool = False, use_compression: bool = False + ) -> None: + message_tree_states: MessageTreeState = self.pr.fetch_message_trees_ready_for_export() + message_tree_ids = [ms.message_tree_id for ms in message_tree_states] + self.export_trees_to_file(message_tree_ids, file, reviewed, include_deleted, use_compression) + + def export_all_user_trees( + self, + user_id: str, + file: str, + reviewed: bool = True, + include_deleted: bool = False, + use_compression: bool = False, + ) -> None: + messages = self.pr.fetch_user_message_trees(UUID(user_id)) + message_tree_ids = [ms.message_tree_id for ms in messages] + self.export_trees_to_file(message_tree_ids, file, reviewed, include_deleted, use_compression) + + @managed_tx_method(CommitMode.COMMIT) + def retry_scoring_failed_message_trees(self): + query = self.db.query(MessageTreeState.message_tree_id).filter( + MessageTreeState.state == message_tree_state.State.SCORING_FAILED + ) + ranking_role_filter = None if self.cfg.rank_prompter_replies else "assistant" + for row in query.all(): + try: + message_tree_id = row["message_tree_id"] + rankings_by_message = self.query_tree_ranking_results(message_tree_id, role_filter=ranking_role_filter) + self.update_message_ranks(message_tree_id=message_tree_id, rankings_by_message=rankings_by_message) + except Exception: + logger.exception(f"retry_scoring_failed_message_trees failed for ({message_tree_id=})") + if __name__ == "__main__": from oasst_backend.api.deps import api_auth + + # from oasst_backend.api.deps import create_api_client from oasst_backend.database import engine from oasst_backend.prompt_repository import PromptRepository with Session(engine) as db: api_client = api_auth(settings.OFFICIAL_WEB_API_KEY, db=db) + # api_client = create_api_client(session=db, description="test", frontend_type="bot") dummy_user = protocol_schema.User(id="__dummy_user__", display_name="Dummy User", auth_method="local") pr = PromptRepository(db=db, api_client=api_client, client_user=dummy_user) - cfg = TreeManagerConfiguration() tm = TreeManager(db, pr, cfg) tm.ensure_tree_states() - tm.purge_user_messages(user_id=UUID("2ef9ad21-0dc5-442d-8750-6f7f1790723f"), purge_initial_prompts=False) + # tm.purge_user_messages(user_id=UUID("2ef9ad21-0dc5-442d-8750-6f7f1790723f"), purge_initial_prompts=False) # tm.purge_user(user_id=UUID("2ef9ad21-0dc5-442d-8750-6f7f1790723f")) # db.commit() # print("query_num_active_trees", tm.query_num_active_trees()) # print("query_incomplete_rankings", tm.query_incomplete_rankings()) # print("query_replies_need_review", tm.query_replies_need_review()) + # print("query_incomplete_reply_reviews", tm.query_replies_need_review()) # print("query_incomplete_initial_prompt_reviews", tm.query_prompts_need_review()) # print("query_extendible_trees", tm.query_extendible_trees()) # print("query_extendible_parents", tm.query_extendible_parents()) - # print("query_tree_size", tm.query_tree_size(message_tree_id=UUID("bdf434cf-4df5-4b74-949c-a5a157bc3292"))) - - # print( - # "query_reviews_for_message", - # tm.query_reviews_for_message(message_id=UUID("6a444493-0d48-4316-a9f1-7e263f5a2473")), - # ) # print("next_task:", tm.next_task()) # print( - # "query_tree_ranking_results", tm.query_tree_ranking_results(UUID("6036f58f-41b5-48c4-bdd9-b16f34ab1312")) + # ".query_tree_ranking_results", tm.query_tree_ranking_results(UUID("2ac20d38-6650-43aa-8bb3-f61080c0d921")) # ) + + # print(tm.export_trees_to_file(message_tree_ids=["7e75fb38-e664-4e2b-817c-b9a0b01b0074"], file="lol.jsonl")) diff --git a/backend/oasst_backend/user_repository.py b/backend/oasst_backend/user_repository.py index 44a7e685..6f96748f 100644 --- a/backend/oasst_backend/user_repository.py +++ b/backend/oasst_backend/user_repository.py @@ -1,10 +1,12 @@ from typing import Optional from uuid import UUID +from oasst_backend.config import settings from oasst_backend.models import ApiClient, User from oasst_backend.utils.database_utils import CommitMode, managed_tx_method from oasst_shared.exceptions import OasstError, OasstErrorCode from oasst_shared.schemas import protocol as protocol_schema +from sqlalchemy.exc import IntegrityError from sqlmodel import Session, and_, or_ from starlette.status import HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND @@ -64,7 +66,13 @@ class UserRepository: return user @managed_tx_method(CommitMode.COMMIT) - def update_user(self, id: UUID, enabled: Optional[bool] = None, notes: Optional[str] = None) -> None: + def update_user( + self, + id: UUID, + enabled: Optional[bool] = None, + notes: Optional[str] = None, + show_on_leaderboard: Optional[bool] = None, + ) -> None: """ Update a user by global user ID to disable or set admin notes. Only trusted clients may update users. @@ -83,6 +91,8 @@ class UserRepository: user.enabled = enabled if notes is not None: user.notes = notes + if show_on_leaderboard is not None: + user.show_on_leaderboard = show_on_leaderboard self.db.add(user) @@ -107,9 +117,7 @@ class UserRepository: self.db.add(user) @managed_tx_method(CommitMode.COMMIT) - def lookup_client_user(self, client_user: protocol_schema.User, create_missing: bool = True) -> Optional[User]: - if not client_user: - return None + def _lookup_client_user_tx(self, client_user: protocol_schema.User, create_missing: bool = True) -> Optional[User]: user: User = ( self.db.query(User) .filter( @@ -135,6 +143,18 @@ class UserRepository: self.db.add(user) return user + def lookup_client_user(self, client_user: protocol_schema.User, create_missing: bool = True) -> Optional[User]: + if not client_user: + return None + num_retries = settings.DATABASE_MAX_TX_RETRY_COUNT + for i in range(num_retries): + try: + return self._lookup_client_user_tx(client_user, create_missing) + except IntegrityError: + # catch UniqueViolation exception, for concurrent requests due to conflicts in ix_user_username + if i + 1 == num_retries: + raise + def query_users_ordered_by_username( self, api_client_id: Optional[UUID] = None, @@ -145,6 +165,7 @@ class UserRepository: auth_method: Optional[str] = None, search_text: Optional[str] = None, limit: Optional[int] = 100, + desc: bool = False, ) -> list[User]: if not self.api_client.trusted: if not api_client_id: @@ -184,14 +205,13 @@ class UserRepository: pattern = "%{}%".format(search_text.replace("\\", "\\\\").replace("_", "\\_").replace("%", "\\%")) qry = qry.filter(User.username.like(pattern)) - if limit is not None and lte_username and not gte_username: - # select top rows but return results in ascernding order - sub_qry = qry.order_by(User.username.desc(), User.id.desc()).limit(limit).subquery("u") - qry = self.db.query(User).select_entity_from(sub_qry).order_by(User.username, User.id) + if desc: + qry = qry.order_by(User.username.desc(), User.id.desc()) else: qry = qry.order_by(User.username, User.id) - if limit is not None: - qry = qry.limit(limit) + + if limit is not None: + qry = qry.limit(limit) return qry.all() @@ -205,7 +225,9 @@ class UserRepository: auth_method: Optional[str] = None, search_text: Optional[str] = None, limit: Optional[int] = 100, + desc: bool = False, ) -> list[User]: + if not self.api_client.trusted: if not api_client_id: # Let unprivileged api clients query their own users without api_client_id being set @@ -255,13 +277,12 @@ class UserRepository: if auth_method: qry = qry.filter(User.auth_method == auth_method) - if limit is not None and lte_display_name and not gte_display_name: - # select top rows but return results in ascernding order - sub_qry = qry.order_by(User.display_name.desc(), User.id.desc()).limit(limit).subquery("u") - qry = self.db.query(User).select_entity_from(sub_qry).order_by(User.display_name, User.id) + if desc: + qry = qry.order_by(User.display_name.desc(), User.id.desc()) else: qry = qry.order_by(User.display_name, User.id) - if limit is not None: - qry = qry.limit(limit) + + if limit is not None: + qry = qry.limit(limit) return qry.all() diff --git a/backend/oasst_backend/user_stats_repository.py b/backend/oasst_backend/user_stats_repository.py index 8b0c17f0..81e5cb92 100644 --- a/backend/oasst_backend/user_stats_repository.py +++ b/backend/oasst_backend/user_stats_repository.py @@ -39,7 +39,7 @@ class UserStatsRepository: qry = ( self.session.query(User.id.label("user_id"), User.username, User.auth_method, User.display_name, UserStats) .join(UserStats, User.id == UserStats.user_id) - .filter(UserStats.time_frame == time_frame.value) + .filter(UserStats.time_frame == time_frame.value, User.show_on_leaderboard) .order_by(UserStats.rank) .limit(limit) ) @@ -250,7 +250,8 @@ FROM PARTITION BY time_frame ORDER BY leader_score DESC, user_id ) AS "rank", user_id, time_frame - FROM user_stats + FROM user_stats us2 + INNER JOIN "user" u ON us2.user_id = u.id AND u.show_on_leaderboard WHERE (:time_frame IS NULL OR time_frame = :time_frame)) AS r WHERE us.user_id = r.user_id diff --git a/backend/oasst_backend/utils/database_utils.py b/backend/oasst_backend/utils/database_utils.py index 803b81e8..5ac25f50 100644 --- a/backend/oasst_backend/utils/database_utils.py +++ b/backend/oasst_backend/utils/database_utils.py @@ -7,9 +7,14 @@ from loguru import logger from oasst_backend.config import settings from oasst_backend.database import engine from oasst_shared.exceptions import OasstError, OasstErrorCode -from sqlalchemy.exc import OperationalError +from psycopg2.errors import DeadlockDetected, ExclusionViolation, SerializationFailure, UniqueViolation +from sqlalchemy.exc import OperationalError, PendingRollbackError from sqlmodel import Session, SQLModel +""" +Error Handling Reference: https://www.postgresql.org/docs/15/mvcc-serialization-failure-handling.html +""" + class CommitMode(IntEnum): """ @@ -26,7 +31,6 @@ class CommitMode(IntEnum): * managed_tx_method and async_managed_tx_method methods are decorators functions * to be used on class functions. It expects the Class to have a 'db' Session object * initialised -* TODO: tx method decorator for non class methods """ @@ -35,28 +39,46 @@ def managed_tx_method(auto_commit: CommitMode = CommitMode.COMMIT, num_retries=s @wraps(f) def wrapped_f(self, *args, **kwargs): try: - for i in range(num_retries): - try: - result = f(self, *args, **kwargs) - if auto_commit == CommitMode.COMMIT: + result = None + if auto_commit == CommitMode.COMMIT: + retry_exhausted = True + for i in range(num_retries): + try: + result = f(self, *args, **kwargs) self.db.commit() - elif auto_commit == CommitMode.FLUSH: - self.db.flush() - elif auto_commit == CommitMode.ROLLBACK: + if isinstance(result, SQLModel): + self.db.refresh(result) + retry_exhausted = False + break + except PendingRollbackError as e: + logger.info(str(e)) self.db.rollback() + except OperationalError as e: + if e.orig is not None and isinstance( + e.orig, (SerializationFailure, DeadlockDetected, UniqueViolation, ExclusionViolation) + ): + logger.info(f"{type(e.orig)} Inner {e.orig.pgcode} {type(e.orig.pgcode)}") + self.db.rollback() + else: + raise e + logger.info(f"Retry {i+1}/{num_retries}") + if retry_exhausted: + raise OasstError( + "DATABASE_MAX_RETIRES_EXHAUSTED", + error_code=OasstErrorCode.DATABASE_MAX_RETRIES_EXHAUSTED, + http_status_code=HTTPStatus.SERVICE_UNAVAILABLE, + ) + else: + result = f(self, *args, **kwargs) + if auto_commit == CommitMode.FLUSH: + self.db.flush() if isinstance(result, SQLModel): self.db.refresh(result) - return result - except OperationalError: - logger.info(f"Retry {i+1}/{num_retries} after possible DB concurrent update conflict.") + elif auto_commit == CommitMode.ROLLBACK: self.db.rollback() - raise OasstError( - "DATABASE_MAX_RETIRES_EXHAUSTED", - error_code=OasstErrorCode.DATABASE_MAX_RETRIES_EXHAUSTED, - http_status_code=HTTPStatus.SERVICE_UNAVAILABLE, - ) + return result except Exception as e: - logger.error("DB Rollback Failure") + logger.info(str(e)) raise e return wrapped_f @@ -71,28 +93,46 @@ def async_managed_tx_method( @wraps(f) async def wrapped_f(self, *args, **kwargs): try: - for i in range(num_retries): - try: - result = await f(self, *args, **kwargs) - if auto_commit == CommitMode.COMMIT: + result = None + if auto_commit == CommitMode.COMMIT: + retry_exhausted = True + for i in range(num_retries): + try: + result = await f(self, *args, **kwargs) self.db.commit() - elif auto_commit == CommitMode.FLUSH: - self.db.flush() - elif auto_commit == CommitMode.ROLLBACK: + if isinstance(result, SQLModel): + self.db.refresh(result) + retry_exhausted = False + break + except PendingRollbackError as e: + logger.info(str(e)) self.db.rollback() + except OperationalError as e: + if e.orig is not None and isinstance( + e.orig, (SerializationFailure, DeadlockDetected, UniqueViolation, ExclusionViolation) + ): + logger.info(f"{type(e.orig)} Inner {e.orig.pgcode} {type(e.orig.pgcode)}") + self.db.rollback() + else: + raise e + logger.info(f"Retry {i+1}/{num_retries}") + if retry_exhausted: + raise OasstError( + "DATABASE_MAX_RETIRES_EXHAUSTED", + error_code=OasstErrorCode.DATABASE_MAX_RETRIES_EXHAUSTED, + http_status_code=HTTPStatus.SERVICE_UNAVAILABLE, + ) + else: + result = await f(self, *args, **kwargs) + if auto_commit == CommitMode.FLUSH: + self.db.flush() if isinstance(result, SQLModel): self.db.refresh(result) - return result - except OperationalError: - logger.info(f"Retry {i+1}/{num_retries} after possible DB concurrent update conflict.") + elif auto_commit == CommitMode.ROLLBACK: self.db.rollback() - raise OasstError( - "DATABASE_MAX_RETIRES_EXHAUSTED", - error_code=OasstErrorCode.DATABASE_MAX_RETRIES_EXHAUSTED, - http_status_code=HTTPStatus.SERVICE_UNAVAILABLE, - ) + return result except Exception as e: - logger.exception("DB Rollback Failure") + logger.info(str(e)) raise e return wrapped_f @@ -115,27 +155,49 @@ def managed_tx_function( @wraps(f) def wrapped_f(*args, **kwargs): try: - for i in range(num_retries): - with session_factory() as session: - try: - result = f(session, *args, **kwargs) - if auto_commit == CommitMode.COMMIT: + result = None + if auto_commit == CommitMode.COMMIT: + retry_exhausted = True + for i in range(num_retries): + with session_factory() as session: + try: + result = f(session, *args, **kwargs) session.commit() - elif auto_commit == CommitMode.FLUSH: - session.flush() - elif auto_commit == CommitMode.ROLLBACK: + if isinstance(result, SQLModel): + session.refresh(result) + retry_exhausted = False + break + except PendingRollbackError as e: + logger.info(str(e)) session.rollback() - return result - except OperationalError: - logger.info(f"Retry {i+1}/{num_retries} after possible DB concurrent update conflict.") - session.rollback() - raise OasstError( - "DATABASE_MAX_RETIRES_EXHAUSTED", - error_code=OasstErrorCode.DATABASE_MAX_RETRIES_EXHAUSTED, - http_status_code=HTTPStatus.SERVICE_UNAVAILABLE, - ) + except OperationalError as e: + if e.orig is not None and isinstance( + e.orig, + (SerializationFailure, DeadlockDetected, UniqueViolation, ExclusionViolation), + ): + logger.info(f"{type(e.orig)} Inner {e.orig.pgcode} {type(e.orig.pgcode)}") + session.rollback() + else: + raise e + logger.info(f"Retry {i+1}/{num_retries}") + if retry_exhausted: + raise OasstError( + "DATABASE_MAX_RETIRES_EXHAUSTED", + error_code=OasstErrorCode.DATABASE_MAX_RETRIES_EXHAUSTED, + http_status_code=HTTPStatus.SERVICE_UNAVAILABLE, + ) + else: + with session_factory() as session: + result = f(session, *args, **kwargs) + if auto_commit == CommitMode.FLUSH: + session.flush() + if isinstance(result, SQLModel): + session.refresh(result) + elif auto_commit == CommitMode.ROLLBACK: + session.rollback() + return result except Exception as e: - logger.error("DB Rollback Failure") + logger.info(str(e)) raise e return wrapped_f diff --git a/backend/oasst_backend/utils/language_classification.py b/backend/oasst_backend/utils/language_classification.py new file mode 100644 index 00000000..ec75b9c3 --- /dev/null +++ b/backend/oasst_backend/utils/language_classification.py @@ -0,0 +1,111 @@ +import os +import pickle +from collections import Counter + +from sklearn import metrics +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.svm import LinearSVC + + +def load_and_split(foldername, num_words): + ls = os.listdir(foldername) + X = [] + Y = [] + langmap = dict() + for idx, x in enumerate(ls): + print("loading language", x) + with open(foldername + "/" + x, "r") as reader: + tmp = reader.read().split(" ") + tmp = [" ".join(tmp[i : i + num_words]) for i in range(0, 100_000, num_words)] + X.extend(tmp) + Y.extend([idx] * len(tmp)) + langmap[idx] = x + x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.90) + return x_train, x_test, y_train, y_test, langmap + + +def build_and_train_pipeline(x_train, y_train): + vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer="char", use_idf=False) + clf = Pipeline( + [ + ("vec", vectorizer), + # ("nystrom", Nystroem(n_components=1000,n_jobs=6)), + ("clf", LinearSVC(C=0.5)), + # ("clf",GaussianNB()) + # ("clf", HistGradientBoostingClassifier()) + ] + ) + print("fitting model...") + clf.fit(x_train, y_train) + return clf + + +def benchmark(clf, x_test, y_test, langmap): + print("benchmarking model...") + y_pred = clf.predict(x_test) + names = list(langmap.values()) + # print(y_test) + # print(langmap) + print(metrics.classification_report(y_test, y_pred, target_names=names)) + cm = metrics.confusion_matrix(y_test, y_pred) + print(cm) + + +def main(foldername, modelname, num_words): + x_train, x_test, y_train, y_test, langmap = load_and_split(foldername=foldername, num_words=num_words) + clf = build_and_train_pipeline(x_train, y_train) + benchmark(clf, x_test, y_test, langmap) + save_model(clf, langmap, num_words, modelname) + model = load(modelname) + print( + "running infernence on long tests", + inference_voter( + model, + """ + What language is this text written in? Nobody knows until you fill in at least ten words. + This test here is to check whether the moving window approach works, + so I still need to fill in a little more text. + """, + ), + ) + + +def load(modelname): + with open(modelname, "rb") as writer: + data = pickle.load(writer) + return data + + +def save_model(model, idx_to_name, num_words, modelname): + out = { + "model": model, + "idx_to_name": idx_to_name, + "num_words": num_words, + } + with open(modelname, "wb") as writer: + pickle.dump(out, writer) + + +def inference_voter(model, text): + tmp = text.split() + # print(len(tmp), tmp) + tmp = [" ".join(tmp[i : i + model["num_words"]]) for i in range(0, len(tmp) - model["num_words"])] + predictions = model["model"].predict(tmp) + # print("integer predictions", predictions) + # print("name predictions", *[model["idx_to_name"][n] for n in predictions]) + result = Counter(predictions).most_common(1)[0][0] + return model["idx_to_name"][result] + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("-m", "--model", help="save location for model and metadata") + parser.add_argument("-d", "--data", help="specify the folder for data files") + parser.add_argument("-n", "--num_words", help="number of words to use for statistics", type=int) + args = parser.parse_args() + # np.set_printoptions(threshold=np.inf) + main(args.data, args.model, args.num_words) diff --git a/backend/oasst_backend/utils/ranking.py b/backend/oasst_backend/utils/ranking.py index 5538d7a3..0bb94fe8 100644 --- a/backend/oasst_backend/utils/ranking.py +++ b/backend/oasst_backend/utils/ranking.py @@ -96,13 +96,15 @@ def ranked_pairs(ranks: List[List[int]]): """ tallies, names = head_to_head_votes(ranks) tallies = tallies - tallies.T - # print(tallies) # note: the resulting tally matrix should be skew-symmetric # order by strength of victory (using tideman's original method, don't think it would make a difference for us) sorted_majorities = [] for i in range(len(ranks[0])): for j in range(len(ranks[0])): - if tallies[i, j] > 0: + # you can never prefer yourself over yourself + # we also have to pick one of the two choices, + # if the preference is exactly zero... + if tallies[i, j] >= 0 and i != j: sorted_majorities.append((i, j, tallies[i, j])) # we don't explicitly deal with tied majorities here sorted_majorities = np.array(sorted(sorted_majorities, key=lambda x: x[2], reverse=True)) @@ -128,13 +130,36 @@ def ranked_pairs(ranks: List[List[int]]): if __name__ == "__main__": - ranks = ( + + ranks = """ ( [("w", "x", "z", "y") for _ in range(1)] + [("w", "y", "x", "z") for _ in range(2)] # + [("x","y","z","w") for _ in range(4)] + [("x", "z", "w", "y") for _ in range(5)] + [("y", "w", "x", "z") for _ in range(1)] # [("y","z","w","x") for _ in range(1000)] - ) + )""" + ranks = [ + [ + ("c5181083-d3e9-41e7-a935-83fb9fa01488"), + ("dcf3d179-0f34-4c15-ae21-b8feb15e422d"), + ("d11705af-5575-43e5-b22e-08d155fbaa62"), + ], + [ + ("d11705af-5575-43e5-b22e-08d155fbaa62"), + ("c5181083-d3e9-41e7-a935-83fb9fa01488"), + ("dcf3d179-0f34-4c15-ae21-b8feb15e422d"), + ], + [ + ("dcf3d179-0f34-4c15-ae21-b8feb15e422d"), + ("c5181083-d3e9-41e7-a935-83fb9fa01488"), + ("d11705af-5575-43e5-b22e-08d155fbaa62"), + ], + [ + ("d11705af-5575-43e5-b22e-08d155fbaa62"), + ("c5181083-d3e9-41e7-a935-83fb9fa01488"), + ("dcf3d179-0f34-4c15-ae21-b8feb15e422d"), + ], + ] rp = ranked_pairs(ranks) print(rp) diff --git a/backend/oasst_backend/utils/tree_export.py b/backend/oasst_backend/utils/tree_export.py new file mode 100644 index 00000000..ee3de9d7 --- /dev/null +++ b/backend/oasst_backend/utils/tree_export.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +import gzip +import json +from collections import defaultdict +from typing import Optional, TextIO + +from fastapi.encoders import jsonable_encoder +from oasst_backend.models import Message +from pydantic import BaseModel + + +class ExportMessageNode(BaseModel): + message_id: str + parent_id: Optional[str] + text: Optional[str] + role: str + review_count: Optional[int] + rank: Optional[int] + replies: Optional[list[ExportMessageNode]] + + @classmethod + def prep_message_export(cls, message: Message) -> ExportMessageNode: + return cls( + message_id=str(message.id), + parent_id=str(message.parent_id) if message.parent_id else None, + text=str(message.payload.payload.text), + role=message.role, + review_count=message.review_count, + rank=message.rank, + ) + + +class ExportMessageTree(BaseModel): + message_tree_id: str + replies: Optional[ExportMessageNode] + + +def build_export_tree(message_tree_id: str, messages: list[Message]) -> ExportMessageTree: + export_tree = ExportMessageTree(message_tree_id=str(message_tree_id)) + export_tree_data = [ExportMessageNode.prep_message_export(m) for m in messages] + + message_parents = defaultdict(list) + for message in export_tree_data: + message_parents[message.parent_id].append(message) + + def build_tree(tree: dict, parent: Optional[str], messages: list[Message]): + children = message_parents[parent] + tree.replies = children + + for idx, child in enumerate(tree.replies): + build_tree(tree.replies[idx], child.message_id, messages) + + build_tree(export_tree, None, export_tree_data) + + return export_tree + + +def write_trees_to_file(file, trees: list[ExportMessageTree], use_compression: bool = True) -> None: + + out_buff: TextIO + if use_compression: + out_buff = gzip.open(file, "wt", encoding="UTF-8") + else: + out_buff = open(file, "wt", encoding="UTF-8") + + with out_buff as f: + for tree in trees: + file_data = jsonable_encoder(tree) + json.dump(file_data, f) + f.write("\n") diff --git a/inference/README.md b/inference/README.md index 3dee94f9..6e1da2c7 100644 --- a/inference/README.md +++ b/inference/README.md @@ -2,7 +2,13 @@ Preliminary implementation of the inference engine for OpenAssistant. -## Development (you'll need multiple terminals) +## Development Variant 1 (you'll need tmux) + +Run `./full-dev-setup.sh` to start the full development setup. Make sure to wait +until the 2nd terminal is ready and says `{"message":"Connected"}` before +entering input into the last terminal. + +## Development Variant 2 (you'll need multiple terminals) Run a redis container (or use the one of the general docker compose file): @@ -26,6 +32,13 @@ pip install -r requirements.txt python __main__.py ``` +For the worker, you'll also want to have the text-generation-inference server +running: + +```bash +docker run --rm -it -p 8001:80 -e MODEL_ID=distilgpt2 ykilcher/text-generation-inference +``` + Run the client: ```bash diff --git a/inference/full-dev-setup.sh b/inference/full-dev-setup.sh new file mode 100755 index 00000000..98a5b173 --- /dev/null +++ b/inference/full-dev-setup.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Creates a tmux window with splits for the individual services + +tmux new-session -d -s "inference-dev-setup" +tmux send-keys "docker run --rm -it -p 6379:6379 redis" C-m +tmux split-window -h +tmux send-keys "docker run --rm -it -p 8001:80 -e MODEL_ID=distilgpt2 ykilcher/text-generation-inference" C-m +tmux split-window -h +tmux send-keys "cd server" C-m +tmux send-keys "uvicorn main:app --reload" C-m +tmux split-window -h +tmux send-keys "cd worker" C-m +tmux send-keys "python __main__.py" C-m +tmux split-window -h +tmux send-keys "cd text-client" C-m +tmux send-keys "sleep 5" C-m +tmux send-keys "python __main__.py" C-m +tmux attach-session -t "inference-dev-setup" diff --git a/inference/server/main.py b/inference/server/main.py index f3ec02b1..4cb5f659 100644 --- a/inference/server/main.py +++ b/inference/server/main.py @@ -5,6 +5,7 @@ import uuid import fastapi import pydantic import redis.asyncio as redis +import websockets.exceptions from fastapi.middleware.cors import CORSMiddleware from loguru import logger from oasst_shared.schemas import inference, protocol @@ -63,6 +64,7 @@ class MessageRequestState(str, enum.Enum): pending = "pending" in_progress = "in_progress" complete = "complete" + aborted_by_worker = "aborted_by_worker" class DbChatEntry(pydantic.BaseModel): @@ -154,40 +156,56 @@ async def create_message(id: str, message_request: MessageRequest, fastapi_reque async def work(websocket: fastapi.WebSocket): await websocket.accept() worker_config = inference.WorkerConfig.parse_raw(await websocket.receive_text()) - while True: - # find a pending task that matches the worker's config - # could also be implemented using task queues - # but general compatibility matching is tricky - for chat in CHATS.values(): - if (request := chat.pending_message_request) is not None: - if chat.message_request_state == MessageRequestState.pending: - if request.compatible_with(worker_config): + try: + while True: + print(websocket.client_state) + if websocket.client_state == fastapi.websockets.WebSocketState.DISCONNECTED: + logger.warning("Worker disconnected") + break + # find a pending task that matches the worker's config + # could also be implemented using task queues + # but general compatibility matching is tricky + for chat in CHATS.values(): + if (request := chat.pending_message_request) is not None: + if chat.message_request_state == MessageRequestState.pending: + if request.compatible_with(worker_config): + break + else: + logger.debug("No pending tasks") + await asyncio.sleep(1) + continue + + chat.message_request_state = MessageRequestState.in_progress + + work_request = inference.WorkRequest( + conversation=chat.conversation, + model_name=request.model_name, + max_new_tokens=request.max_new_tokens, + ) + + logger.info(f"Created {work_request}") + try: + await websocket.send_text(work_request.json()) + except websockets.exceptions.ConnectionClosedError: + logger.warning("Worker disconnected") + websocket.close() + chat.message_request_state = MessageRequestState.pending + break + + try: + while True: + # maybe unnecessary to parse and re-serialize + # could just pass the raw string and mark end via empty string + response_packet = inference.WorkResponsePacket.parse_raw(await websocket.receive_text()) + await redisClient.rpush(chat.id, response_packet.json()) + if response_packet.is_end: break - else: - logger.debug("No pending tasks") - await asyncio.sleep(1) - continue + except fastapi.WebSocketException: + # TODO: handle this better + logger.exception(f"Websocket closed during handling of {chat.id}") + chat.message_request_state = MessageRequestState.aborted_by_worker + raise - chat.message_request_state = MessageRequestState.in_progress - - work_request = inference.WorkRequest( - conversation=chat.conversation, - model_name=request.model_name, - max_new_tokens=request.max_new_tokens, - ) - - logger.info(f"Created {work_request}") - try: - await websocket.send_text(work_request.json()) - while True: - # maybe unnecessary to parse and re-serialize - # could just pass the raw string and mark end via empty string - response_packet = inference.WorkResponsePacket.parse_raw(await websocket.receive_text()) - await redisClient.rpush(chat.id, response_packet.json()) - if response_packet.is_end: - break - except fastapi.WebSocketException: - # TODO: handle this better - logger.exception(f"Websocket closed during handling of {chat.id}") - - chat.message_request_state = MessageRequestState.complete + chat.message_request_state = MessageRequestState.complete + except fastapi.WebSocketException: + logger.exception("Websocket closed") diff --git a/inference/worker/__main__.py b/inference/worker/__main__.py index ad5e5cef..96fe164a 100644 --- a/inference/worker/__main__.py +++ b/inference/worker/__main__.py @@ -1,13 +1,12 @@ -import re -import time +import json import rel -import torch +import requests +import sseclient import typer import websocket from loguru import logger from oasst_shared.schemas import inference, protocol -from transformers import pipeline app = typer.Typer() @@ -16,12 +15,13 @@ app = typer.Typer() def main( backend_url: str = "ws://localhost:8000", model_name: str = "distilgpt2", + inference_server_url: str = "http://localhost:8001", ): - pipe = pipeline("text-generation", model=model_name) - def on_open(ws: websocket.WebSocket): + logger.info("Connected to backend, sending config...") worker_config = inference.WorkerConfig(model_name=model_name) ws.send(worker_config.json()) + logger.info("Config sent, waiting for work...") def on_message(ws: websocket.WebSocket, message: str): # TODO: what if this comes in, but one is already in progress? @@ -35,25 +35,39 @@ def main( # construct prompt messages = [_prepare_message(message) for message in work_request.conversation.messages] - prompt = "\n".join(messages) + "\nAssistant:" + prefix = ( + "The following is a conversation between a user and an assistant. " + "The assistant is helpful, creative, clever, and very friendly.\n" + "Assistant: Hello! How can I help you today?\n" + ) - # TODO: replace this with incremental generation - torch.manual_seed(work_request.seed) - model_output = pipe(prompt, max_new_tokens=work_request.max_new_tokens, do_sample=True, return_full_text=False)[ - 0 - ]["generated_text"] - model_output = model_output.strip() + prompt = prefix + "\n".join(messages) + "\nAssistant:" - # fake streaming - split_idcs = [m.start() for m in re.finditer(r"([\w:]+)", model_output)] - pieces = [model_output[a:b] for a, b in zip([0] + split_idcs, split_idcs + [None])] - for piece in pieces: - if not piece: - continue - if piece.strip() in ("User:", "Assistant:"): + response = requests.post( + f"{inference_server_url}/generate_stream", + json={ + "inputs": prompt, + "parameters": { + "max_new_tokens": work_request.max_new_tokens, + "do_sample": work_request.do_sample, + "top_k": work_request.top_k, + "top_p": work_request.top_p, + "temperature": work_request.temperature, + "seed": work_request.seed, + }, + }, + stream=True, + headers={"Accept": "text/event-stream"}, + ) + response.raise_for_status() + + client = sseclient.SSEClient(response) + for event in client.events(): + data = json.loads(event.data) + if data["is_end"]: break - ws.send(inference.WorkResponsePacket(token=piece).json()) - time.sleep(0.1) + intermediate = data["event"] + ws.send(inference.WorkResponsePacket(token=intermediate["token"]).json()) ws.send(inference.WorkResponsePacket(is_end=True).json()) def on_error(ws: websocket.WebSocket, error: Exception): diff --git a/inference/worker/requirements.txt b/inference/worker/requirements.txt index c248c652..82169379 100644 --- a/inference/worker/requirements.txt +++ b/inference/worker/requirements.txt @@ -1,6 +1,6 @@ loguru rel -torch -transformers +requests +sseclient-py typer websocket-client diff --git a/model/supervised_finetuning/custom_datasets/__init__.py b/model/supervised_finetuning/custom_datasets/__init__.py index 2e1e4b30..558ec502 100644 --- a/model/supervised_finetuning/custom_datasets/__init__.py +++ b/model/supervised_finetuning/custom_datasets/__init__.py @@ -43,7 +43,7 @@ def get_one_dataset(conf, dataset_name): if dataset_name == "debate_sum": train, eval = train_val_dataset(train, val_split=0.2) else: - val_name = "validation" if dataset_name not in ["billsum"] else "test" + val_name = "validation" if dataset_name not in ["billsum", "tldr_news"] else "test" eval = SummarizationDataset(dataset_name, conf.cache_dir, val_name) elif "ted_trans" in dataset_name: language_pair = dataset_name.split("_")[-1] diff --git a/model/supervised_finetuning/custom_datasets/dialogue_collator.py b/model/supervised_finetuning/custom_datasets/dialogue_collator.py index 719fa0d6..c96ed576 100644 --- a/model/supervised_finetuning/custom_datasets/dialogue_collator.py +++ b/model/supervised_finetuning/custom_datasets/dialogue_collator.py @@ -3,7 +3,6 @@ from typing import Optional, Union import numpy as np import torch -from custom_datasets.qa_datasets import QA_SPECIAL_TOKENS from torch.nn import functional as F from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase @@ -23,15 +22,8 @@ class DialogueDataCollator: flatten_messages = [] label_masks = [] - for feature_one in features: - assert len(feature_one) % 2 == 0, "Number of messages must be even" - # TODO: we should push this to dataset __getitem__ - messages = [ - (QA_SPECIAL_TOKENS["Question"] if i % 2 == 0 else "") - + x - + (QA_SPECIAL_TOKENS["Answer"] if i % 2 == 0 else "") - for i, x in enumerate(feature_one) - ] + for messages in features: + messages = list(messages) # Add a way for the model to terminate generation # When we predict the start of a new expected question, we want to be able to stop generation diff --git a/model/supervised_finetuning/custom_datasets/formatting.py b/model/supervised_finetuning/custom_datasets/formatting.py new file mode 100644 index 00000000..a6c1c0d8 --- /dev/null +++ b/model/supervised_finetuning/custom_datasets/formatting.py @@ -0,0 +1,5 @@ +QA_SPECIAL_TOKENS = {"Question": "", "Answer": "", "StartPrefix": "", "EndPrefix": ""} + + +def format_pair(pair): + return "{}{}{}".format(QA_SPECIAL_TOKENS["Question"], pair[0], QA_SPECIAL_TOKENS["Answer"]), pair[1] diff --git a/model/supervised_finetuning/custom_datasets/prompt_dialogue.py b/model/supervised_finetuning/custom_datasets/prompt_dialogue.py index 4a1d83a3..1c823934 100644 --- a/model/supervised_finetuning/custom_datasets/prompt_dialogue.py +++ b/model/supervised_finetuning/custom_datasets/prompt_dialogue.py @@ -2,6 +2,7 @@ import json import os from urllib.request import urlopen +from custom_datasets.formatting import format_pair from torch.utils.data import Dataset @@ -49,8 +50,7 @@ class PromptGeneratedDataset(Dataset): return len(self.pairs) def __getitem__(self, index): - question, answer = self.pairs[index] - return question, answer + return format_pair(self.pairs[index]) class InstructionTuning(Dataset): @@ -101,5 +101,4 @@ class InstructionTuning(Dataset): return len(self.pairs) def __getitem__(self, index): - question, answer = self.pairs[index] - return question, answer + return format_pair(self.pairs[index]) diff --git a/model/supervised_finetuning/custom_datasets/qa_datasets.py b/model/supervised_finetuning/custom_datasets/qa_datasets.py index 7d9c7f48..47b1c247 100644 --- a/model/supervised_finetuning/custom_datasets/qa_datasets.py +++ b/model/supervised_finetuning/custom_datasets/qa_datasets.py @@ -7,14 +7,13 @@ import re from urllib.request import urlopen import numpy as np +from custom_datasets.formatting import QA_SPECIAL_TOKENS, format_pair from datasets import load_dataset from torch.utils.data import Dataset # @agoryuno contributed this re_reference_remove = re.compile(r"\[\d+(?:,\s*\d+)*?\]") -QA_SPECIAL_TOKENS = {"Question": "", "Answer": "", "StartPrefix": "", "EndPrefix": ""} - def index_squad_v2(example): if len(example["answers"]["text"]): @@ -78,7 +77,7 @@ class QADataset(Dataset): def __getitem__(self, idx): data = self.dataset[idx] - return self.index_fn(data) + return format_pair(self.index_fn(data)) class WebGPT(Dataset): @@ -111,7 +110,7 @@ class WebGPT(Dataset): def __getitem__(self, index): question = self.index2question[index] answer = self.questions[question] - return [question, answer] + return format_pair((question, answer)) class SODA(Dataset): @@ -121,14 +120,14 @@ class SODA(Dataset): def process_soda_convo(self, data): pairs = [] play_as = data["speakers"][1] - prefix = "{}{}. {}{}".format( - QA_SPECIAL_TOKENS["StartPrefix"], - data["narrative"], - "your name {}".format(play_as), - QA_SPECIAL_TOKENS["EndPrefix"], - ) question, answer = "", "" prefix, postfix = "", "" + dialogue_bg = "{}{} {}{}".format( + QA_SPECIAL_TOKENS["StartPrefix"], + data["narrative"], + "your are {}".format(play_as), + QA_SPECIAL_TOKENS["EndPrefix"], + ) previous_chat = [] for idx, convo in enumerate(data["dialogue"]): @@ -138,14 +137,20 @@ class SODA(Dataset): else: answer = convo postfix = data["speakers"][idx] + if len(question) and len(answer) and prefix != postfix and postfix == play_as: history = "".join( - ["{}{}{}".format(p[0], QA_SPECIAL_TOKENS["Answer"], p[1]) for p in previous_chat] + [ + "{}{}{}{}".format(QA_SPECIAL_TOKENS["Question"], p[0], QA_SPECIAL_TOKENS["Answer"], p[1]) + for p in previous_chat + ] ) if len(history): history += "" - pairs.append((prefix + history + question, answer)) + prompt = QA_SPECIAL_TOKENS["Question"] + question + QA_SPECIAL_TOKENS["Answer"] + pairs.append((dialogue_bg + history + prompt, answer)) previous_chat.append((question, answer)) + return pairs def __init__(self, cache_dir, max_sample_size=10000, input_max_length=1024) -> None: @@ -166,8 +171,8 @@ class SODA(Dataset): return len(self.pairs) def __getitem__(self, index): - question, answer = self.pairs[index] - return question, answer + # special token added during preprocess + return self.pairs[index] class SODADialogue(Dataset): @@ -218,7 +223,7 @@ class SODADialogue(Dataset): return len(self.pairs) def __getitem__(self, index): - return self.pairs[index] + return format_pair(self.pairs[index]) class JokeExplaination(Dataset): @@ -253,8 +258,7 @@ class JokeExplaination(Dataset): return len(self.pairs) def __getitem__(self, index): - question, answer = self.pairs[index] - return question, answer + return format_pair(self.pairs[index]) # https://huggingface.co/datasets/aquamuse diff --git a/model/supervised_finetuning/custom_datasets/summarization.py b/model/supervised_finetuning/custom_datasets/summarization.py index 2a097fe7..85d21a27 100644 --- a/model/supervised_finetuning/custom_datasets/summarization.py +++ b/model/supervised_finetuning/custom_datasets/summarization.py @@ -3,6 +3,7 @@ """ import random +from custom_datasets.formatting import format_pair from datasets import load_dataset from torch.utils.data import Dataset @@ -54,11 +55,12 @@ def index_summary_merge(text, summary): class SummarizationDataset(Dataset): - def __init__(self, dataset, cache_dir, split): + def __init__(self, dataset, cache_dir, split, max_words=512): self.name = dataset self.dataset = load_dataset(*summarization_config_mapping[dataset], cache_dir=cache_dir, split=split) self.text_column, self.summary_column = summarization_name_mapping[dataset] self.preprocess_fn = index_summary_merge if dataset == "scitldr" else index_summary_default + self.max_words = max_words def __len__(self): return len(self.dataset) @@ -72,4 +74,5 @@ class SummarizationDataset(Dataset): else: prompt = random.choice(SUMMARIZATION_SPECIAL_TOKENS["Summary"]) - return ("".join([SUMMARIZATION_SPECIAL_TOKENS["Text"], " ".join(text.split(" ")[:256]), prompt]), summary) + context = "".join([SUMMARIZATION_SPECIAL_TOKENS["Text"], " ".join(text.split(" ")[: self.max_words]), prompt]) + return format_pair((context, summary)) diff --git a/model/supervised_finetuning/custom_datasets/toxic_conversation.py b/model/supervised_finetuning/custom_datasets/toxic_conversation.py index 815ac722..640b8d8d 100644 --- a/model/supervised_finetuning/custom_datasets/toxic_conversation.py +++ b/model/supervised_finetuning/custom_datasets/toxic_conversation.py @@ -4,12 +4,13 @@ """ import random +from custom_datasets.formatting import QA_SPECIAL_TOKENS, format_pair from datasets import load_dataset from torch.utils.data import Dataset class ProsocialDialogueExplaination(Dataset): - name = "prosocial_explain" + name = "explain_prosocial" TEMPLATE = [ # 0 : reply or sentence of interest, 1 : reason of caution ("'{0}' Why is this sentence {1}", "This sentence is {1} because {0}"), @@ -36,7 +37,7 @@ class ProsocialDialogueExplaination(Dataset): return len(self.pairs) def __getitem__(self, idx): - return self.pairs[idx] + return format_pair(self.pairs[idx]) class ProsocialDialogue(Dataset): @@ -58,8 +59,9 @@ class ProsocialDialogue(Dataset): dataset = load_dataset("allenai/prosocial-dialog", cache_dir=cache_dir)[split] self.pairs = [] for row in dataset: + prompt = QA_SPECIAL_TOKENS["Question"] + row["context"] + QA_SPECIAL_TOKENS["Answer"] for answer in row["rots"]: - self.pairs.append((self.PREFIX + row["context"], answer)) + self.pairs.append((self.PREFIX + prompt, answer)) def __len__(self): return len(self.pairs) diff --git a/model/supervised_finetuning/custom_datasets/translation.py b/model/supervised_finetuning/custom_datasets/translation.py index 694d31ce..f9a71a8e 100644 --- a/model/supervised_finetuning/custom_datasets/translation.py +++ b/model/supervised_finetuning/custom_datasets/translation.py @@ -8,6 +8,7 @@ """ import random +from custom_datasets.formatting import format_pair from datasets import load_dataset from torch.utils.data import Dataset @@ -82,7 +83,7 @@ class TranslationPair(Dataset): return len(self.pairs) def __getitem__(self, index): - return self.pairs[index] + return format_pair(self.pairs[index]) class WMT2019(TranslationPair): @@ -99,6 +100,8 @@ class WMT2019(TranslationPair): else: # translating in reverse direction source = random.choice(TRANSLATION_PROMPT[src]).format(row[tgt]) self.pairs.append((source, row[src])) + if len(self.pairs) > 100000: + break class DiveMT(TranslationPair): diff --git a/model/supervised_finetuning/tests/test_datasets.py b/model/supervised_finetuning/tests/test_datasets.py index 3b59f289..8d5ad08f 100644 --- a/model/supervised_finetuning/tests/test_datasets.py +++ b/model/supervised_finetuning/tests/test_datasets.py @@ -7,8 +7,8 @@ from custom_datasets.dialogue_collator import DialogueDataCollator def test_all_datasets(): qa_base = QA_DATASETS summarize_base = SUMMARIZATION_DATASETS - others = ["prompt_dialogue", "webgpt", "soda", "joke", "instruct_tuning"] - translation = ["dive_mt", "wmt2019_zh-en", "wmt2019_ru-en", "wmt2019_de-en", "ted_trans_de-ja", "ted_trans_nl-en"] + others = ["prompt_dialogue", "webgpt", "soda", "joke", "instruct_tuning", "explain_prosocial", "prosocial_dialogue"] + translation = ["dive_mt", "wmt2019_zh-en", "wmt2019_ru-en", "ted_trans_de-ja", "ted_trans_nl-en"] config = Namespace(cache_dir=".cache") for dataset_name in translation + others + summarize_base + qa_base: @@ -31,7 +31,6 @@ def test_collate_fn(): qa_base = QA_DATASETS summarize_base = SUMMARIZATION_DATASETS others = ["prompt_dialogue", "webgpt", "soda", "joke", "gsm8k"] - trains, evals = [], [] for dataset_name in others + qa_base + summarize_base: print(dataset_name) @@ -41,10 +40,10 @@ def test_collate_fn(): dataloader = DataLoader(ConcatDataset(trains), collate_fn=collate_fn, batch_size=128) for batch in dataloader: - # print(batch.keys()) - # print(tokenizer.decode(batch['input_ids'][0])) - # print('-----') - # print(tokenizer.decode(batch['targets'][0][batch['label_masks'][0]])) + print(batch.keys()) + print(tokenizer.decode(batch["input_ids"][0])) + print("-----") + print(tokenizer.decode(batch["targets"][0][batch["label_masks"][0]])) assert batch["targets"].shape[1] <= 512 dataloader = DataLoader(ConcatDataset(evals), collate_fn=collate_fn, batch_size=128) for batch in dataloader: diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index 7b6e03b6..f7a0ab15 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -25,6 +25,10 @@ def get_tokenizer(conf): tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"}) elif "codegen" in conf.model_name: tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"}) + elif "pythia" in conf.model_name: + tokenizer.add_special_tokens( + {"pad_token": "<|padding|>", "sep_token": "<|endoftext|>", "eos_token": "<|endoftext|>"} + ) additional_special_tokens = ( [] diff --git a/notebooks/data-augmentation/changemyview-builder/README.md b/notebooks/data-augmentation/changemyview-builder/README.md new file mode 100644 index 00000000..62ab9b2e --- /dev/null +++ b/notebooks/data-augmentation/changemyview-builder/README.md @@ -0,0 +1,37 @@ +# README + +## Introduction + +This program converts data obtained from the subreddit r/changemyview into a cleaner format for further data processing. The data is not clean enough to be used directly in a model yet, and additional preprocessing is required. + +## Data Format + +The cleaned data is stored in an Apache Parquet file with the following columns: + +| Column Name | Description | Data Type | +|-------------|------------------------------------------------------------------------|----------------| +| INSTRUCTION | Post title + body text | String | +| RESPONSE | Body text of comments attempting to change OP's mind of `INSTRUCTION`. | List\ | +| SOURCE | Permalink to the reddit post | String | +| METADATA | Metadata related to `RESPONSE`. | Dict\ | + +### Metadata +Currently, metadata is only broken into one category: +- `detoxify_labels`- A Dictionary of values outputted by the [Unitaryai Detoxifier](https://github.com/unitaryai/detoxify) model, fitted to every comment under any given post. + +## Usage + +To use the program, follow these instructions: + +1. **Clone the repository** - `git clone https://github.com/LAION-AI/Open-Assistant.git` +2. **Navigate to the project directory** - `cd notebooks/data-augmentation/changemyview-builder` +3. **Open the Jupyter Notebook** - `jupyter notebook data_processor.ipynb` +4. **Run the program** - Go through the notebook and run the cells + +## Contributing + +If you would like to contribute to this project, please fork the repository and submit a pull request with your changes. + +## License + +This project is licensed under the Apache-2.0 License - see the [LICENSE](LICENSE) file for details. \ No newline at end of file diff --git a/notebooks/data-augmentation/changemyview-builder/data_processor.ipynb b/notebooks/data-augmentation/changemyview-builder/data_processor.ipynb new file mode 100644 index 00000000..03155d40 --- /dev/null +++ b/notebooks/data-augmentation/changemyview-builder/data_processor.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# r/ChangeMyView data converter\n", + "Converts subreddit data into readable format for ML training\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/changemyview-builder/data_processor.ipynb)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 65, + "outputs": [], + "source": [ + "### REMEMBER: setup the .env before running this code!\n", + "\n", + "\"\"\"CONSTANTS\"\"\"\n", + "\n", + "# Set the head number to the amount of entries you want to load in minus one\n", + "ENTRIES_COUNT = 10\n", + "\n", + "# Set the threshold for toxic comments to be removed\n", + "TOXIC_THRESHOLD = 0.95" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 66, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (1.4.4)\r\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from pandas) (2.8.2)\r\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from pandas) (2022.1)\r\n", + "Requirement already satisfied: numpy>=1.18.5 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from pandas) (1.21.5)\r\n", + "Requirement already satisfied: six>=1.5 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\r\n", + "Requirement already satisfied: praw in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (7.6.1)\r\n", + "Requirement already satisfied: websocket-client>=0.54.0 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from praw) (0.58.0)\r\n", + "Requirement already satisfied: update-checker>=0.18 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from praw) (0.18.0)\r\n", + "Requirement already satisfied: prawcore<3,>=2.1 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from praw) (2.3.0)\r\n", + "Requirement already satisfied: requests<3.0,>=2.6.0 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from prawcore<3,>=2.1->praw) (2.28.1)\r\n", + "Requirement already satisfied: six in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from websocket-client>=0.54.0->praw) (1.16.0)\r\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (3.3)\r\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2022.9.24)\r\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2.0.4)\r\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (1.26.11)\r\n", + "Requirement already satisfied: python-dotenv in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (0.21.0)\r\n", + "Requirement already satisfied: pyarrow in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (10.0.1)\r\n", + "Requirement already satisfied: numpy>=1.16.6 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from pyarrow) (1.21.5)\r\n", + "Requirement already satisfied: detoxify in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (0.5.1)\r\n", + "Requirement already satisfied: transformers==4.22.1 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from detoxify) (4.22.1)\r\n", + "Requirement already satisfied: torch>=1.7.0 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from detoxify) (1.13.1)\r\n", + "Requirement already satisfied: sentencepiece>=0.1.94 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from detoxify) (0.1.97)\r\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.9.0 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (0.11.1)\r\n", + "Requirement already satisfied: regex!=2019.12.17 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (2022.7.9)\r\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (6.0)\r\n", + "Requirement already satisfied: tqdm>=4.27 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (4.64.1)\r\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.13,>=0.11.1 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (0.12.1)\r\n", + "Requirement already satisfied: filelock in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (3.6.0)\r\n", + "Requirement already satisfied: packaging>=20.0 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (21.3)\r\n", + "Requirement already satisfied: numpy>=1.17 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (1.21.5)\r\n", + "Requirement already satisfied: requests in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from transformers==4.22.1->detoxify) (2.28.1)\r\n", + "Requirement already satisfied: typing-extensions in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (4.3.0)\r\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from packaging>=20.0->transformers==4.22.1->detoxify) (3.0.9)\r\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from requests->transformers==4.22.1->detoxify) (3.3)\r\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from requests->transformers==4.22.1->detoxify) (1.26.11)\r\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from requests->transformers==4.22.1->detoxify) (2.0.4)\r\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (from requests->transformers==4.22.1->detoxify) (2022.9.24)\r\n", + "Requirement already satisfied: tqdm in /Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages (4.64.1)\r\n" + ] + } + ], + "source": [ + "# Install any dependencies\n", + "!pip install pandas\n", + "!pip install praw\n", + "!pip install python-dotenv\n", + "!pip install pyarrow\n", + "!pip install detoxify\n", + "!pip install tqdm" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import praw\n", + "import os\n", + "from os.path import join, dirname\n", + "from dotenv import main\n", + "\n", + "# Make sure you create a .env file and fill in all the necessary information in the same folder as this script!\n", + "main.load_dotenv(join(dirname(os.path.realpath('__file__')), '.env'))\n", + "\n", + "reddit = praw.Reddit(\n", + " client_id=os.environ.get(\"CLIENT_ID\"),\n", + " client_secret=os.environ.get(\"CLIENT_SECRET\"),\n", + " user_agent=\"CMV_Scraper\",\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "outputs": [], + "source": [ + "# load the data\n", + "import tarfile\n", + "import os.path\n", + "import json\n", + "import re\n", + "from bz2 import BZ2File\n", + "from urllib import request\n", + "from io import BytesIO\n", + "\n", + "import numpy as np\n", + "\n", + "\n", + "fname = \"cmv.tar.bz2\"\n", + "url = \"https://chenhaot.com/data/cmv/\" + fname\n", + "\n", + "# download if not exists\n", + "if not os.path.isfile(fname):\n", + " f = BytesIO()\n", + " with request.urlopen(url) as resp, open(fname, 'wb') as f_disk:\n", + " data = resp.read()\n", + " f_disk.write(data) # save to disk too\n", + " f.write(data)\n", + " f.seek(0)\n", + "else:\n", + " f = open(fname, 'rb')\n", + "\n", + "\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 69, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kayjaymac/opt/anaconda3/lib/python3.9/bz2.py:124: ResourceWarning: unclosed file <_io.BufferedReader name='cmv.tar.bz2'>\n", + " self._buffer = None\n", + "ResourceWarning: Enable tracemalloc to get the object allocation traceback\n" + ] + } + ], + "source": [ + "#tar = tarfile.open(fileobj=f, mode=\"r:bz2\")\n", + "tar = tarfile.open(fileobj=f, mode=\"r\")\n", + "\n", + "# Extract the file we are interested in\n", + "\n", + "train_fname = \"op_task/train_op_data.jsonlist.bz2\"\n", + "test_fname = \"op_task/heldout_op_data.jsonlist.bz2\"\n", + "\n", + "train_bzlist = tar.extractfile(train_fname)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 70, + "outputs": [], + "source": [ + "# Deserialize the JSON list\n", + "original_posts_train = [\n", + " json.loads(line.decode('utf-8'))\n", + " for line in BZ2File(train_bzlist)\n", + "]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 71, + "outputs": [ + { + "data": { + "text/plain": "[{'title': \"CMV: I shouldn't get a job in this economic climate because it'll be automated anyway; I should just wait for a post-scarcity utopia.\",\n 'delta_label': False,\n 'name': 't3_2rpsl8',\n 'selftext': \"I think the world is automating fast enough that a utopia will arise where no one will have to work anymore. Within the next 2 decades or so, having a job won't mean much, and most people will be artists and scientists. \\n\\nMy parents let me live with them, so I can just wait until the utopia happens.\\n\\nCMV.\"}]" + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_posts_train[:1]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 72, + "outputs": [], + "source": [ + "# Load the jsonlist file into a dataframe\n", + "#df = pd.read_json(original_posts_train, orient='list', lines=True)\n", + "df = pd.DataFrame(original_posts_train)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 73, + "outputs": [], + "source": [ + "# Function to check if the posts still exists on reddit\n", + "def try_get_post(post_id):\n", + " try:\n", + " submission = reddit.submission(id=post_id)\n", + " submission.name\n", + " return True\n", + " except Exception as e:\n", + " return False" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 74, + "outputs": [], + "source": [ + "# Set up the detoxifier model:\n", + "from detoxify import Detoxify" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "# Removes > sign and the template message at the end of a message\n", + "def cleanup_body_text(cmv_post):\n", + " lines = [line for line in cmv_post.splitlines()\n", + " if not line.lstrip().startswith(\">\")\n", + " and not line.lstrip().startswith(\"____\")\n", + " and not line.lstrip().startswith(\"So go forth and CMV, noble redditors!\")\n", + " and \"edit\" not in \" \".join(line.lower().split()[:2])\n", + " ]\n", + " return \"\\n\".join(lines)\n", + "\n", + "\n", + "\n", + "\n", + "# Create the function that will be handling all the data gathering\n", + "def get_top_comment_and_clean_data(post_id):\n", + " #print(post_id.lstrip(\"t3_\"))\n", + " last_author = \"\"\n", + " # Grab the post\n", + " submission = reddit.submission(id=post_id.lstrip(\"t3_\"))\n", + " #print(submission.title)\n", + "\n", + " # Grab the highest rated comment on root layer\n", + " submission.submission_type = 'best'\n", + " submission.comments.replace_more(limit=0)\n", + " replies = list(submission.comments)[0].replies.list()\n", + "\n", + " # Just some variables\n", + " pros = []\n", + "\n", + " # If the post author doesn't exist this submission was deleted (submission.deleted doesn't work)\n", + " if type(submission.author) == type(None):\n", + " last_author = \"[deleted]\"\n", + " else:\n", + " last_author = submission.author.name\n", + "\n", + " is_pro_argument = False\n", + "\n", + " for comment in replies:\n", + "\n", + " # If redditor object doesn't exist, the account is invalid/deleted\n", + " if type(comment.author) != type(None):\n", + " author = comment.author.name\n", + " else:\n", + " author = \"[deleted]\"\n", + "\n", + " # Assume that whenever the user changes, they are countering the previous person\n", + " if author != last_author:\n", + " is_pro_argument = !is_pro_argument\n", + "\n", + " if author == \"[deleted]\" or author==\"DeltaBot\":\n", + " #print(\"Skipping comment...\")\n", + " continue\n", + "\n", + " # Remove meta and duplicate comments\n", + " comment.body = \" \".join([line for line in comment.body.splitlines()\n", + " if not re.search(r\"(?i)(Change\\smy\\sview|CMV)\", line)\n", + " and line not in pros # Why doesn't this line work\n", + " ])\n", + "\n", + " # Sometimes for some reason duplicate entries exist\n", + " # Also remove automated message with \"Δ\" in it\n", + "\n", + " if comment.body in pros:\n", + " #print(\"Skipping duplicate entry\")\n", + " continue\n", + "\n", + " #print(\"\\t\\t>>\\t\",comment.body)\n", + "\n", + " # Remove toxic comments\n", + " if Detoxify(\"multilingual\").predict(comment.body)[\"toxicity\"] > TOXIC_THRESHOLD:\n", + " #print(\"Identified toxic comment, ignoring...\")\n", + " comment.body = \"\"\n", + "\n", + " # Add to the respective argument type \n", + " if is_pro_argument:\n", + " pros.append(comment.body)\n", + " \n", + " last_author = comment.author.name\n", + " \n", + " # Pros = arguments for the Title of this post\n", + " # Cons = arguments against the title of this post\n", + "\n", + " pros.append(comment.body)\n", + " return pros" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading in 10 posts\n" + ] + } + ], + "source": [ + "print(f\"Loading in {ENTRIES_COUNT} posts\")\n", + "dataset = df.head(ENTRIES_COUNT)\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 77, + "outputs": [], + "source": [ + "# the name column does some weird sh** because dataframes already have a name property, so migrate to a different column name\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "dataset[\"post_id\"] = dataset[\"name\"]\n", + "warnings.filterwarnings('default')" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading in data... This will take a while.\n" + ] + }, + { + "data": { + "text/plain": " 0%| | 0/10 [00:00:29: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "/Users/kayjaymac/opt/anaconda3/lib/python3.9/site-packages/torch/serialization.py:997: ResourceWarning: unclosed file <_io.BufferedReader name='cmv.tar.bz2'>\n", + " storage = zip_file.get_storage_from_record(name, numel, torch._UntypedStorage).storage()._untyped()\n", + "ResourceWarning: Enable tracemalloc to get the object allocation traceback\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7min 49s, sys: 2min 29s, total: 10min 19s\n", + "Wall time: 8min 45s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from tqdm.auto import tqdm\n", + "# Reset variables for if we run this multiple times\n", + "all_pros = []\n", + "all_names = []\n", + "all_titles = []\n", + "all_sources = []\n", + "\n", + "print(\"Loading in data... This will take a while.\")\n", + "\n", + "for i in tqdm(range(dataset.shape[0])):\n", + "\n", + " post = dataset.iloc[i]\n", + " modified_title = post.title.replace('CMV', \"Change my mind\")\n", + " #print(f\"\\n Loading entry {i+1}/{dataset.shape[0]}:\\n\\t\\\"{modified_title}\\\"\")\n", + "\n", + " if type(post) == type(None):\n", + " continue\n", + "\n", + " assert(post.post_id != i)\n", + "\n", + " pros = get_top_comment_and_clean_data(post.post_id)\n", + "\n", + " if post.title == \"[deleted]\":\n", + " continue\n", + "\n", + " pros = \" \".join([*set(pros)])\n", + " pros = pros.replace(\"[deleted]\",\"\")\n", + "\n", + " post.selftext = cleanup_body_text(post.selftext)\n", + " all_titles.append(modified_title + \" \" + post.selftext)\n", + " all_pros.append(pros)\n", + " all_names.append(post.name)\n", + " all_sources.append(f\"https://reddit.com/r/changemyview/comments/{post.post_id}\")\n", + " #print(post.title)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "outputs": [ + { + "data": { + "text/plain": "'it\\'s already been signed. They even claim to be adhering to it, though they\\'ve been found to be violating it before. There is no such thing as \"de facto acceptance of Israel\\'s nuclear program.\" the Non-Proliferation Treaty is only binding for signatory states. Israel is not a signatory. Article 10 of the NPT allows them to withdraw if they so choose. they have not done so. a whole new country which explicitly has a right to withdraw from the NPT and has not chosen to do so. It\\'s more accurate, I think, to say that the problem with Iran here from a legal standpoint is that they aren\\'t honoring their own commitments, rather than that they\\'re building weapons. They could pull out of the NPT at any time, and the ball would be essentially in America\\'s court, because their nuclear program would no longer be illegal by international legal standards. However, Iran insists both on developing nukes *and* remaining an NPT signatory non-nuclear state, and that\\'s what makes their program illegal. I\\'d also like to clarify that I\\'m not making an ethical argument here, this is just how international law currently works. because international law doesn\\'t require states to sign treaties, it only requires them to adhere to treaties they\\'ve already signed. Israel isn\\'t defying the UN, at least not in this particular case. Think of the NPT less like a standard law within a state and more like a contract. Once you\\'ve signed, you\\'re bound by the contract, but if you never sign it then you haven\\'t broken a law, you\\'ve just decided not to agree to the terms you were offered. > Because Iran did sign the treaty, and thus are bound by it. They signed on July 1, 1968. Hmm. So is the argument here that it\\'s not \"ok\" for Iran to have a nuke, since they signed treaty not to do so. But it\\'s \"ok\" for Israel to have one because they never signed such thing? Can\\'t quite put my finger on it, but doesn\\'t seem quite right this one.'" + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_pros[1]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 80, + "outputs": [], + "source": [ + "# Place it all into a Pandas Dataframe\n", + "clean_df = pd.DataFrame({\n", + " \"INSTRUCTION\": all_titles,\n", + " \"RESPONSE\": all_pros,\n", + " \"SOURCE\": all_sources\n", + "}, index=all_names\n", + ")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "# Create Apache Paquete file\n", + "\n", + "import pyarrow as pa\n", + "import pyarrow.parquet as pq\n", + "\n", + "table = pa.Table.from_pandas(clean_df)\n", + "pq.write_table(table,\"output.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "outputs": [ + { + "data": { + "text/plain": " INSTRUCTION \\\n0 Change my mind: I shouldn't get a job in this ... \n1 Change my mind: Iran has the right to develop ... \n2 Change my mind: The events in Paris suck...but... \n3 Change my mind: It is ok to hate a religion so... \n4 Change my mind: There is no productive reason ... \n5 Change my mind: Diet soda is perfectly healthy... \n6 Change my mind:Essential Oils are bullshit My ... \n7 Change my mind: I think the Paris shooting mak... \n8 Change my mind: Printing an image of the Musli... \n9 Change my mind: Philosophy has no tangible val... \n\n RESPONSE \\\n0 That is what someone in the 1500s would have s... \n1 it's already been signed. They even claim to b... \n2 Hm I guess I made the OP incorrectly. The mai... \n3 I don't understand your analogy. Promoting a ... \n4 ∆ I hadn't thought it from a \"let's trick peop... \n5 Thanks for a fresh argument! I hadn't conside... \n6 Most do. Some smell kinda funky. \n7 I already said in different comments that thi... \n8 The first bacon sandwich came about because 9... \n9 >Why restrict it to 50 years? I can name all s... \n\n SOURCE \n0 https://reddit.com/r/changemyview/comments/t3_... \n1 https://reddit.com/r/changemyview/comments/t3_... \n2 https://reddit.com/r/changemyview/comments/t3_... \n3 https://reddit.com/r/changemyview/comments/t3_... \n4 https://reddit.com/r/changemyview/comments/t3_... \n5 https://reddit.com/r/changemyview/comments/t3_... \n6 https://reddit.com/r/changemyview/comments/t3_... \n7 https://reddit.com/r/changemyview/comments/t3_... \n8 https://reddit.com/r/changemyview/comments/t3_... \n9 https://reddit.com/r/changemyview/comments/t3_... ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
INSTRUCTIONRESPONSESOURCE
0Change my mind: I shouldn't get a job in this ...That is what someone in the 1500s would have s...https://reddit.com/r/changemyview/comments/t3_...
1Change my mind: Iran has the right to develop ...it's already been signed. They even claim to b...https://reddit.com/r/changemyview/comments/t3_...
2Change my mind: The events in Paris suck...but...Hm I guess I made the OP incorrectly. The mai...https://reddit.com/r/changemyview/comments/t3_...
3Change my mind: It is ok to hate a religion so...I don't understand your analogy. Promoting a ...https://reddit.com/r/changemyview/comments/t3_...
4Change my mind: There is no productive reason ...∆ I hadn't thought it from a \"let's trick peop...https://reddit.com/r/changemyview/comments/t3_...
5Change my mind: Diet soda is perfectly healthy...Thanks for a fresh argument! I hadn't conside...https://reddit.com/r/changemyview/comments/t3_...
6Change my mind:Essential Oils are bullshit My ...Most do. Some smell kinda funky.https://reddit.com/r/changemyview/comments/t3_...
7Change my mind: I think the Paris shooting mak...I already said in different comments that thi...https://reddit.com/r/changemyview/comments/t3_...
8Change my mind: Printing an image of the Musli...The first bacon sandwich came about because 9...https://reddit.com/r/changemyview/comments/t3_...
9Change my mind: Philosophy has no tangible val...>Why restrict it to 50 years? I can name all s...https://reddit.com/r/changemyview/comments/t3_...
\n
" + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Test to see if it was sucessful\n", + "table = pq.read_table(\"output.parquet\")\n", + "table.to_pandas()" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/oasst-shared/oasst_shared/exceptions/oasst_api_error.py b/oasst-shared/oasst_shared/exceptions/oasst_api_error.py index 0a548ebb..e7f5dcd5 100644 --- a/oasst-shared/oasst_shared/exceptions/oasst_api_error.py +++ b/oasst-shared/oasst_shared/exceptions/oasst_api_error.py @@ -38,6 +38,8 @@ class OasstErrorCode(IntEnum): TASK_REQUESTED_TYPE_NOT_AVAILABLE = 1006 TASK_AVAILABILITY_QUERY_FAILED = 1007 TASK_MESSAGE_TOO_LONG = 1008 + TASK_MESSAGE_DUPLICATED = 1009 + TASK_MESSAGE_TEXT_EMPTY = 1010 # 2000-3000: prompt_repository INVALID_FRONTEND_MESSAGE_ID = 2000 @@ -76,6 +78,8 @@ class OasstErrorCode(IntEnum): USER_DISABLED = 4001 USER_NOT_FOUND = 4002 + EMOJI_OP_UNSUPPORTED = 5000 + class OasstError(Exception): """Base class for Open-Assistant exceptions.""" diff --git a/oasst-shared/oasst_shared/schemas/inference.py b/oasst-shared/oasst_shared/schemas/inference.py index 0acb5014..91a16b61 100644 --- a/oasst-shared/oasst_shared/schemas/inference.py +++ b/oasst-shared/oasst_shared/schemas/inference.py @@ -13,7 +13,11 @@ class WorkRequest(pydantic.BaseModel): conversation: protocol.Conversation = pydantic.Field(..., repr=False) model_name: str = "distilgpt2" max_new_tokens: int = 100 - seed: int = pydantic.Field(default_factory=lambda: random.randint(0, 2**32 - 1)) + seed: int = pydantic.Field(default_factory=lambda: random.randint(-(2**31), 2**31 - 1)) + do_sample: bool = True + top_k: int = 50 + top_p: float = 0.9 + temperature: float = 1.0 class WorkResponsePacket(pydantic.BaseModel): diff --git a/oasst-shared/oasst_shared/schemas/protocol.py b/oasst-shared/oasst_shared/schemas/protocol.py index 20bbdf9b..cbb4d29c 100644 --- a/oasst-shared/oasst_shared/schemas/protocol.py +++ b/oasst-shared/oasst_shared/schemas/protocol.py @@ -57,6 +57,8 @@ class ConversationMessage(BaseModel): text: str lang: Optional[str] # BCP 47 is_assistant: bool + emojis: Optional[dict[str, int]] = None + user_emojis: Optional[list[str]] = None class Conversation(BaseModel): @@ -222,27 +224,43 @@ class LabelTaskMode(str, enum.Enum): full = "full" -class LabelInitialPromptTask(Task): - """A task to label an initial prompt.""" +class LabelTaskDisposition(str, enum.Enum): + """Reason why the task was issued.""" - type: Literal["label_initial_prompt"] = "label_initial_prompt" + quality = "quality" + spam = "spam" + + +class LabelDescription(BaseModel): + name: str + widget: str + display_text: str + help_text: Optional[str] + + +class AbstractLabelTask(Task): message_id: UUID - prompt: str valid_labels: list[str] mandatory_labels: Optional[list[str]] mode: Optional[LabelTaskMode] + disposition: Optional[LabelTaskDisposition] + labels: Optional[list[LabelDescription]] -class LabelConversationReplyTask(Task): +class LabelInitialPromptTask(AbstractLabelTask): + """A task to label an initial prompt.""" + + type: Literal["label_initial_prompt"] = "label_initial_prompt" + prompt: str + + +class LabelConversationReplyTask(AbstractLabelTask): """A task to label a reply to a conversation.""" type: Literal["label_conversation_reply"] = "label_conversation_reply" conversation: Conversation # the conversation so far - message_id: UUID + reply_message: Optional[ConversationMessage] reply: str - valid_labels: list[str] - mandatory_labels: Optional[list[str]] - mode: Optional[LabelTaskMode] class LabelPrompterReplyTask(LabelConversationReplyTask): @@ -315,39 +333,48 @@ class MessageRanking(Interaction): ranking: conlist(item_type=int, min_items=1) +class LabelWidget(str, enum.Enum): + yes_no = "yes_no" + flag = "flag" + likert = "likert" + + class TextLabel(str, enum.Enum): """A label for a piece of text.""" - def __new__(cls, label: str, display_text: str = "", help_text: str = None): + def __new__(cls, label: str, widget: LabelWidget, display_text: str = "", help_text: str = None): obj = str.__new__(cls, label) obj._value_ = label + obj.widget = widget obj.display_text = display_text obj.help_text = help_text return obj - spam = "spam", "Seems to be intentionally low-quality or irrelevant" - fails_task = "fails_task", "Fails to follow the correct instruction / task" - not_appropriate = "not_appropriate", "Inappropriate for customer assistant" - violence = "violence", "Encourages or fails to discourage violence/abuse/terrorism/self-harm" - excessive_harm = ( - "excessive_harm", - "Content likely to cause excessive harm not justifiable in the context", - "Harm refers to physical or mental damage or injury to someone or something. Excessive refers to a reasonable threshold of harm in the context, for instance damaging skin is not excessive in the context of surgery.", - ) - sexual_content = "sexual_content", "Contains sexual content" - toxicity = "toxicity", "Contains rude, abusive, profane or insulting content" - moral_judgement = "moral_judgement", "Expresses moral judgement" - political_content = "political_content", "Expresses political views" - humor = "humor", "Contains humorous content including sarcasm" + # yes/no questions + spam = "spam", LabelWidget.yes_no, "Seems to be intentionally low-quality or irrelevant" + fails_task = "fails_task", LabelWidget.yes_no, "Fails to follow the correct instruction / task" + + # flags + pii = "pii", LabelWidget.flag, "Contains personal identifiable information (PII)" + not_appropriate = "not_appropriate", LabelWidget.flag, "Inappropriate" hate_speech = ( "hate_speech", + LabelWidget.flag, "Content is abusive or threatening and expresses prejudice against a protected characteristic", - "Prejudice refers to preconceived views not based on reason. Protected characteristics include gender, ethnicity, religion, sexual orientation, and similar characteristics.", + "Prejudice refers to preconceived views not based on reason. Protected characteristics " + "include gender, ethnicity, religion, sexual orientation, and similar characteristics.", ) - threat = "threat", "Contains a threat against a person or persons" - misleading = "misleading", "Contains text which is incorrect or misleading" - helpful = "helpful", "Completes the task to a high standard" - creative = "creative", "Expresses creativity in responding to the task" + sexual_content = "sexual_content", LabelWidget.flag, "Contains sexual content" + moral_judgement = "moral_judgement", LabelWidget.flag, "Expresses moral judgement" + political_content = "political_content", LabelWidget.flag, "Expresses political views" + + # likert + quality = "quality", LabelWidget.likert, "Overall subjective quality rating of the message" + toxicity = "toxicity", LabelWidget.likert, "Rude, abusive, profane or insulting content" + humor = "humor", LabelWidget.likert, "Humorous content including sarcasm" + helpfulness = "helpfulness", LabelWidget.likert, "Helpfulness of the message" + creativity = "creativity", LabelWidget.likert, "Creativity" + violence = "violence", LabelWidget.likert, "Violence/abuse/terrorism/self-harm" class TextLabels(Interaction): @@ -358,6 +385,7 @@ class TextLabels(Interaction): labels: dict[TextLabel, float] message_id: UUID task_id: Optional[UUID] + is_report: Optional[bool] @property def has_message_id(self) -> bool: @@ -432,3 +460,29 @@ class OasstErrorResponse(BaseModel): error_code: OasstErrorCode message: str + + +class EmojiCode(str, enum.Enum): + thumbs_up = "+1" # 👍 + thumbs_down = "-1" # 👎 + red_flag = "red_flag" # 🚩 + hundred = "100" # 💯 + rofl = "rofl" # 🤣 + clap = "clap" # 👏 + diamond = "diamond" # 💎 + heart_eyes = "heart_eyes" # 😍 + disappointed = "disappointed" # 😞 + poop = "poop" # 💩 + skull = "skull" # 💀 + + +class EmojiOp(str, enum.Enum): + togggle = "toggle" + add = "add" + remove = "remove" + + +class MessageEmojiRequest(BaseModel): + user: User + op: EmojiOp = EmojiOp.togggle + emoji: EmojiCode diff --git a/openassistant/datasets/mt_note_generation/README.md b/openassistant/datasets/mt_note_generation/README.md new file mode 100644 index 00000000..74754417 --- /dev/null +++ b/openassistant/datasets/mt_note_generation/README.md @@ -0,0 +1,101 @@ +--- +annotations_creators: + - no-annotation +language: + - en +language_creators: + - machine-generated +license: + - mit +multilinguality: + - monolingual +pretty_name: "MT Note Generation" +size_categories: + - <500 +source_datasets: + - mt_samples +tags: + - open-assistant +task_categories: + - conversational +task_ids: + - dialogue-generation +dataset_info: + features: + - name: conversation + dtype: string + splits: + - name: train + num_bytes: 1060800 + num_examples: 270 + - name: test + num_bytes: 339323 + num_examples: 90 + - name: validation + num_bytes: 337018 + num_examples: 90 +--- + +# Dataset Card for MT Samples Clinical Note Generation + +## Dataset Description + +- **Repository:** + [Code](https://github.com/LAION-AI/Open-Assistant/openassistant/datasets/mt_note_generation/) + +## Dataset Summary + +MT Note Generation is a set of synthetic dialogues between Assistant and User +where the user asks the assistant to generate a clinical note for a patient +persona. + +This data was created by synthesizing the dialogues in +[MT Samples](https://mtsamples.com/) for clinical note generation. + +Example: + +``` +User: Write a clinical note about a patient with the following chief complaint: Stomach pain for 2 weeks. + +ASSISTANT: + +CHIEF COMPLAINT: Stomach pain for 2 weeks. HISTORY OF PRESENT ILLNESS: The patient is a 45yo Mexican man without significant past medical history who presents to the emergency room with complaints of mid-epigastric and right upper quadrant abdominal pain for the last 14 days. The pain was initially crampy and burning in character and was relieved with food intake. He also reports that it initially was associated with a sour taste in his mouth. He went to his primary care physician who prescribed cimetidine 400mg qhs x 5 days; however, this did not relieve his symptoms. In fact, the pain has worsened such that the pain now radiates to the back but is waxing and waning in duration. It is relieved with standing and ambulation and exacerbated when lying in a supine position. He reports a decrease in appetite associated with a 4 lb. wt loss over the last 2 wks. He does have nausea with only one episode of non-bilious, non-bloody emesis on day of admission. He reports a 2 wk history of subjective fever and diaphoresis. He denies any diarrhea, constipation, dysuria, melena, or hematochezia. His last bowel movement was during the morning of admission and was normal. He denies any travel in the last 9 years and sick contacts.PAST MEDICAL HISTORY: Right inguinal groin cyst removal 15 years ago. Unknown etiology. No recurrence. + +PAST SURGICAL HISTORY: Left femoral neck fracture with prosthesis secondary to a fall 4 years ago. + +FAMILY HISTORY: Mother with diabetes. No history of liver disease. No malignancies. + +SOCIAL HISTORY: The patient was born in central Mexico but moved to the United States 9 years ago. He is on disability due to his prior femoral fracture. He denies any tobacco or illicit drug use. He only drinks alcohol socially, no more than 1 drink every few weeks. He is married and has 3 healthy children. He denies any tattoos or risky sexual behavior. + +ALLERGIES: NKDA. + +MEDICATIONS: Tylenol prn (1-2 tabs every other day for the last 2 wks), Cimetidine 400mg po qhs x 5 days. + +REVIEW OF SYSTEMS: No headache, vision changes. No shortness of breath. No chest pain or palpitations. + +PHYSICAL EXAMINATION: Vitals: T 100.9-102.7 BP 136/86 Pulse 117 RR 12 98% sat on room air,Gen: Well-developed, well-nourished, no apparent distress.HEENT: Pupils equal, round and reactive to light. Anicteric. Oropharynx clear and moist.Neck: Supple. No lymphadenopathy or carotid bruits. No thyromegaly or masses.CHEST: Clear to auscultation bilaterally.CV: Tachycardic but regular rhythm, normal S1/S2, no murmurs/rubs/gallops.Abd: Soft, active bowel sounds. Tender in the epigastrium and right upper quadrant with palpation associated with slight guarding. No rebound tenderness. No hepatomegaly. No splenomegaly.Rectal: Stool was brown and guaiac negative.Ext: No cyanosis/clubbing/edema.Neurological: He was alert and oriented x3. CN II-XII intact. Normal 2+ DTRs. No focal neurological deficit.Skin: No jaundice. No skin rashes or lesions. + +IMAGING DATA:CT Abdomen with contrast ( 11/29/03 ): There is a 6x6 cm multilobular hypodense mass seen at the level of the hepatic hilum and caudate lobe which is resulting in mass effect with dilatation of the intrahepatic radicals of the left lobe of the liver. The rest of the liver parenchyma is homogeneous. The gallbladder, pancreas, spleen, adrenal glands and kidneys are within normal limits. The retroperitoneal vascular structures are within normal limits. There is no evidence of lymphadenopathy, free fluid or fluid collections.HOSPITAL COURSE: The patient was admitted to the hospital for further evaluation. A diagnostic procedure was performed. +``` + +## Usage + +The dataset contains one configuration, `dialogue_modeling`, which has a single +text `conversation` feature. + +## Source data + +The script modifies data from mtsamples.csv which is hosted in Kaggle: +https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions + +## Citation + +Please cite our work if you find the resources in this repository useful: + +``` +@article{pruks2023mtsamplesnotegen, + author = {Yada Pruksachatkun}, + title = {MT Samples Note Generation}, + year = {2023} +} +``` diff --git a/website/src/.prettierignore b/openassistant/datasets/mt_note_generation/__init__.py similarity index 100% rename from website/src/.prettierignore rename to openassistant/datasets/mt_note_generation/__init__.py diff --git a/openassistant/datasets/mt_note_generation/hub.py b/openassistant/datasets/mt_note_generation/hub.py new file mode 100644 index 00000000..296ecee4 --- /dev/null +++ b/openassistant/datasets/mt_note_generation/hub.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass + +import datasets + + +@dataclass +class OpenAssistantConfig(datasets.BuilderConfig): + """BuilderConfig for OpenAssistant datasets.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +features = datasets.Features( + { + "conversation": datasets.Value("string"), + } +) diff --git a/openassistant/datasets/mt_note_generation/mt_note_generation.py b/openassistant/datasets/mt_note_generation/mt_note_generation.py new file mode 100644 index 00000000..d78b91b8 --- /dev/null +++ b/openassistant/datasets/mt_note_generation/mt_note_generation.py @@ -0,0 +1,123 @@ +# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +MT Note Generation is a set of synthetic dialogues between Assistant and +User where the user asks the assistant to generate a clinical note for a patient persona. +""" + +import json +from typing import Dict, List, Tuple + +import datasets + +from .hub import OpenAssistantConfig, features + +_CITATION = """\ + @misc{transcribed medical transcription sample reports and examples, title={Welcome to MTSamples}, + url={https://mtsamples.com/}, + journal={Transcribed Medical Transcription Sample Reports and Examples}} +""" + +_DATASETNAME = "mt_note_generation" +_DISPLAYNAME = "MT Samples Note Generation" + +_DESCRIPTION = """\ +A dataset of instructions for generating clinical notes from MT samples. +""" + +_HOMEPAGE = "" + +_LICENSE = "mit" + +_URLS = { + _DATASETNAME: { + "train": "./data/mt_note_generation_train.jsonl", + "test": "./data/mt_note_generation_test.jsonl", + "validation": "./data/mt_note_generation_validation.jsonl", + } +} + +_SUPPORTED_TASKS = ["dialogue-modeling"] + +_VERSION = "1.0.0" + + +class MTNoteGenerationDataset(datasets.GeneratorBasedBuilder): + """A set of dialogues synthesized from the MT Samples dataset.""" + + VERSION = datasets.Version(_VERSION) + + BUILDER_CONFIGS = [ + OpenAssistantConfig( + name=f"{_DATASETNAME}_dialogue_modeling", + version=VERSION, + description=f"OpenAssistant dataset config for {_DATASETNAME}", + schema="dialogue_modeling", + subset_id=_DATASETNAME, + ) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_dialogue_modeling" + + def _info(self) -> datasets.DatasetInfo: + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir, + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir, + "split": "validation", + }, + ), + ] + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + if self.config.schema == "dialogue_modeling": + key = 0 + with open(filepath[split], "r", encoding="utf8") as data: + while True: + line = data.readline() + if not line: + return + yield key, json.loads(line) + key += 1 diff --git a/openassistant/datasets/mt_note_generation/prepare.py b/openassistant/datasets/mt_note_generation/prepare.py new file mode 100644 index 00000000..7f0a9146 --- /dev/null +++ b/openassistant/datasets/mt_note_generation/prepare.py @@ -0,0 +1,84 @@ +import json +import math +import os +import random +import re +import sys +from string import punctuation + +import kaggle +import pandas as pd + +CLINICAL_NOTE_GENERATION_TEMPLATE = """User: Write a clinical note about a patient with the following {section}: {section_information}. +Rosey: {note}""" + + +def preprocess(mt_dataset): + def filter_for_notes(row): + normalized_transcript = row["transcription"].lower() + if "chief complaint:" in normalized_transcript: + return True + return False + + mt_dataset = mt_dataset.dropna(subset=["description", "transcription"]) + mt_note_subset = mt_dataset[mt_dataset.apply(filter_for_notes, axis=1)] + return mt_note_subset + + +def is_chief_complaint(section): + return "chief complaint" in section.lower() + + +def get_conversations(dataset): + def normalize_transcript(x): + x = re.sub(r"\.+", ".", x) + x = re.sub(r"\,+", ",", x) + x = re.sub(r":\s+", ": ", x) + x = re.sub(r"\.\s+", ". ", x) + x = re.sub(r":(\s)*\,+", ": ", x) + x = re.sub(r"\.\,+", ". ", x) + return x + + conversations = [] + for idx in range(len(dataset)): + transcript = normalize_transcript(dataset.iloc[idx]["transcription"]) + sections = re.findall(r"\b[A-Z]+(?: [A-Z]+)*:", transcript) + if len(sections) >= 2: + note_prompt = transcript.split(sections[0])[1].split(sections[1])[0] + else: + continue + section_name = sections[0].lower().strip(punctuation) + if len(note_prompt.split(" ")) > 30 and is_chief_complaint(section_name): + # There are some chief complaints that seem to be HPI + section_name = "history of present illness" + conversations.append( + CLINICAL_NOTE_GENERATION_TEMPLATE.format( + section=section_name, section_information=note_prompt, note=transcript + ) + ) + return conversations + + +def main(output_dir: str = "data"): + """Download and prepare the dataset for use.""" + os.makedirs(output_dir, exist_ok=True) + kaggle.api.dataset_download_files("tboyle10/medicaltranscriptions", "data", unzip=True) + mt_samples = preprocess(pd.read_csv("mtsamples.csv")) + conversations = get_conversations(mt_samples) + random.shuffle(conversations) + train_limit = math.ceil(len(conversations) * 0.6) + dev_limit = math.ceil(len(conversations) * 0.8) + train, validation, test = ( + conversations[:train_limit], + conversations[train_limit:dev_limit], + conversations[dev_limit:], + ) + splits = {"train": train, "validation": validation, "test": test} + for split in ["train", "validation", "test"]: + with open(f"{output_dir}/mt_note_generation_{split}.jsonl", "w") as f: + for conversation in splits[split]: + f.write(f"{json.dumps({'conversation': conversation})}\n") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/backend-development/run-local.sh b/scripts/backend-development/run-local.sh index 7366cde6..064fd843 100755 --- a/scripts/backend-development/run-local.sh +++ b/scripts/backend-development/run-local.sh @@ -7,6 +7,7 @@ pushd "$parent_path/../../backend" export DEBUG_USE_SEED_DATA=True export DEBUG_SKIP_TOXICITY_CALCULATION=True export DEBUG_ALLOW_SELF_LABELING=True +export DEBUG_ALLOW_DUPLICATE_TASKS=True export DEBUG_SKIP_EMBEDDING_COMPUTATION=True uvicorn main:app --reload --port 8080 --host 0.0.0.0 diff --git a/website/.prettierignore b/website/.prettierignore new file mode 100644 index 00000000..0be2a485 --- /dev/null +++ b/website/.prettierignore @@ -0,0 +1,3 @@ +.eslintrc.json +tailwind.config.js +.storybook/* diff --git a/website/src/.prettierrc.json b/website/.prettierrc.json similarity index 100% rename from website/src/.prettierrc.json rename to website/.prettierrc.json diff --git a/website/.storybook/main.js b/website/.storybook/main.js index ea519486..d92f6562 100644 --- a/website/.storybook/main.js +++ b/website/.storybook/main.js @@ -10,6 +10,7 @@ module.exports = { "@storybook/addon-essentials", "@storybook/addon-interactions", "@chakra-ui/storybook-addon", + "storybook-addon-next-router", ], framework: "@storybook/react", core: { @@ -23,7 +24,12 @@ module.exports = { config.resolve.alias = { ...config.resolve.alias, src: path.resolve(__dirname, "../src"), + styles: path.resolve(__dirname, "../styles"), }; + config.resolve.fallback = { + fs: false, + path: require.resolve('path-browserify'), + } return config; }, features: { diff --git a/website/.storybook/preview.js b/website/.storybook/preview.js index 39a4bb8e..cc1487fb 100644 --- a/website/.storybook/preview.js +++ b/website/.storybook/preview.js @@ -1,4 +1,5 @@ import "!style-loader!css-loader!postcss-loader!tailwindcss/tailwind.css"; +import { RouterContext } from "next/dist/shared/lib/router-context"; export const parameters = { actions: { argTypesRegex: "^on[A-Z].*" }, @@ -8,6 +9,9 @@ export const parameters = { date: /Date$/, }, }, + nextRouter: { + Provider: RouterContext.Provider, + }, }; // Hacky solution to get Images in next to work diff --git a/website/README.md b/website/README.md index a30f2754..09b40e49 100644 --- a/website/README.md +++ b/website/README.md @@ -2,8 +2,7 @@ ## Purpose -This provides a comprehensive webapp interface for LAION's Open Assistant -project. Initially it will support: +This provides a comprehensive webapp interface for LAION's Open Assistant project. Initially it will support: 1. User registration using either Discord or Email. 1. Adding responses to incomplete Open Assistant tasks. @@ -11,8 +10,7 @@ project. Initially it will support: 1. Viewing an activity leaderboard. 1. Tracking community wide updates. -This interface compliments the Discord bot and will give access to the same -underlying tasks. +This interface compliments the Discord bot and will give access to the same underlying tasks. ## Contributing @@ -22,67 +20,54 @@ This website is built using: 1. [npm](https://www.npmjs.com/): The node package manager for building. 1. [React](https://reactjs.org/): The core frontend framework. -1. [Next.js](https://nextjs.org/): A React scaffolding framework to streamline - development. -1. [Prisma](https://www.prisma.io/): An ORM to interact with a web specific - [Postgres](https://www.postgresql.org/) database. -1. [NextAuth.js](https://next-auth.js.org/): A user authentication framework to - ensure we handle accounts with best practices. -1. [TailwindCSS](https://tailwindcss.com/): A general purpose framework for - styling any component. -1. [Chakra-UI](https://chakra-ui.com/): A wide collection of pre-built UI - components that generally look pretty good. +1. [Next.js](https://nextjs.org/): A React scaffolding framework to streamline development. +1. [Prisma](https://www.prisma.io/): An ORM to interact with a web specific [Postgres](https://www.postgresql.org/) + database. +1. [NextAuth.js](https://next-auth.js.org/): A user authentication framework to ensure we handle accounts with best + practices. +1. [TailwindCSS](https://tailwindcss.com/): A general purpose framework for styling any component. +1. [Chakra-UI](https://chakra-ui.com/): A wide collection of pre-built UI components that generally look pretty good. ### Set up your environment -To contribute to the website, make sure you have the following setup and -installed: +To contribute to the website, make sure you have the following setup and installed: -1. [NVM](https://github.com/nvm-sh/nvm): The Node Version Manager makes it easy - to ensure you have the right NodeJS version installed. Once installed, run - `nvm use 16` to use Node 16.x. The website is known to be stable with NodeJS +1. [NVM](https://github.com/nvm-sh/nvm): The Node Version Manager makes it easy to ensure you have the right NodeJS + version installed. Once installed, run `nvm use 16` to use Node 16.x. The website is known to be stable with NodeJS version 16.x. This will install both Node and NPM. -1. [Docker](https://www.docker.com/): We use docker to simplify running - dependent services. +1. [Docker](https://www.docker.com/): We use docker to simplify running dependent services. ### Getting everything up and running If you're doing active development we suggest the following workflow: 1. In one tab, navigate to the project root. -1. Run `docker compose up frontend-dev --build --attach-dependencies`. You can - optionally include `-d` to detach and later track the logs if desired. +1. Run `docker compose up frontend-dev --build --attach-dependencies`. You can optionally include `-d` to detach and + later track the logs if desired. 1. In another tab navigate to `${OPEN_ASSISTANT_ROOT/website`. 1. Run `npm ci` -1. Run `npx prisma db push` (This is also needed when you restart the docker - stack from scratch). -1. Run `npm run dev`. Now the website is up and running locally at - `http://localhost:3000`. -1. To create an account, login via the user using email authentication and - navigate to `http://localhost:1080`. Check the email listed and click the - log in link. You're now logged in and authenticated. +1. Run `npx prisma db push` (This is also needed when you restart the docker stack from scratch). +1. Run `npm run dev`. Now the website is up and running locally at `http://localhost:3000`. +1. To create an account, login via the user using email authentication and navigate to `http://localhost:1080`. Check + the email listed and click the log in link. You're now logged in and authenticated. ### Using debug user credentials -You can use the debug credentials provider to log in without fancy emails or -OAuth. +You can use the debug credentials provider to log in without fancy emails or OAuth. -1. This feature is automatically on in development mode, i.e. when you run - `npm run dev`. In case you want to do the same with a production build (for - example, the docker image), then run the website with environment variable +1. This feature is automatically on in development mode, i.e. when you run `npm run dev`. In case you want to do the + same with a production build (for example, the docker image), then run the website with environment variable `DEBUG_LOGIN=true`. 1. Use the `Login` button in the top right to go to the login page. -1. You should see a section for debug credentials. Enter any username you wish, - you will be logged in as that user. +1. You should see a section for debug credentials. Enter any username you wish, you will be logged in as that user. ### Using Storybook -To develop components using [Storybook](https://storybook.js.org/) run -`npm run storybook`. Then navigate to in your browser to -`http://localhost:6006`. +To develop components using [Storybook](https://storybook.js.org/) run `npm run storybook`. Then navigate to in your +browser to `http://localhost:6006`. -To create a new story create a file named `[componentName].stories.js`. An -example how such a story could look like, see `Header.stories.jsx`. +To create a new story create a file named `[componentName].stories.js`. An example how such a story could look like, see +`Header.stories.jsx`. ## Code Layout @@ -90,12 +75,10 @@ example how such a story could look like, see `Header.stories.jsx`. All react code is under `src/` with a few sub directories: -1. `pages/`: All pages a user could navigate too and API URLs which are under - `pages/api/`. -1. `components/`: All re-usable React components. If something gets used twice - we should create a component and put it here. -1. `lib/`: A generic place to store library files that are used anywhere. This - doesn't have much structure yet. +1. `pages/`: All pages a user could navigate too and API URLs which are under `pages/api/`. +1. `components/`: All re-usable React components. If something gets used twice we should create a component and put it + here. +1. `lib/`: A generic place to store library files that are used anywhere. This doesn't have much structure yet. NOTE: `styles/` can be ignored for now. @@ -113,25 +96,20 @@ We're not really using CSS styles. `styles/` can be ignored. ## Testing the UI -Cypress is used for end-to-end (e2e) and component testing and is configured in -`./cypress.config.ts`. The `./cypress` folder is used for supporting -configuration files etc. +Cypress is used for end-to-end (e2e) and component testing and is configured in `./cypress.config.ts`. The `./cypress` +folder is used for supporting configuration files etc. - Store e2e tests in the `./cypress/e2e` folder. -- Store component tests adjacent to the component being tested. If you want to - wriite a test for `./src/components/Layout.tsx` then store the test file at - `./src/components/Layout.cy.tsx`. +- Store component tests adjacent to the component being tested. If you want to wriite a test for + `./src/components/Layout.tsx` then store the test file at `./src/components/Layout.cy.tsx`. A few npm scripts are available for convenience: -- `npm run cypress`: Useful for development, it opens Cypress and allows you to - explore, run and debug tests. It assumes you have the NextJS site running at - `localhost:3000`. -- `npm run cypress:run`: Runs all tests. Useful for a quick sanity check before - sending a PR or to run in CI pipelines. -- `npm run cypress:image-baseline`: If you have tests failing because of visual - changes that was expected, this command will update the baseline images stored - in `./cypress-visual-screenshots/baseline` with those from the adjacent +- `npm run cypress`: Useful for development, it opens Cypress and allows you to explore, run and debug tests. It assumes + you have the NextJS site running at `localhost:3000`. +- `npm run cypress:run`: Runs all tests. Useful for a quick sanity check before sending a PR or to run in CI pipelines. +- `npm run cypress:image-baseline`: If you have tests failing because of visual changes that was expected, this command + will update the baseline images stored in `./cypress-visual-screenshots/baseline` with those from the adjacent comparison folder. More can be found in the [docs of `uktrade/cypress-image-diff`](https://github.com/uktrade/cypress-image-diff/blob/main/docs/CLI.md#update-all-baseline-images-for-failing-tests). @@ -141,10 +119,9 @@ Read more in the [./cypress README](cypress/). Jest and React Testing Library are used for unit testing JS/TS/TSX code. -- Store unit test files adjacent to the file being tested and have the filename - end with `.test.ts` for non-React code or `.test.tsx` for React code. -- `npm run jest`: automatically runs tests and watches for any relevant changes - to rerun tests. +- Store unit test files adjacent to the file being tested and have the filename end with `.test.ts` for non-React code + or `.test.tsx` for React code. +- `npm run jest`: automatically runs tests and watches for any relevant changes to rerun tests. Read more in the [./src/README.md](src/README.md). @@ -152,30 +129,25 @@ Read more in the [./src/README.md](src/README.md). When writing code for the website, we have a few best practices: -1. When importing packages import external dependencies first then local - dependencies. Order them alphabetically according to the package name. -1. When trying to implement something new, check if - [Chakra-UI](https://chakra-ui.com/) has components that are close enough to - your need. For example Sliders, Radio Buttons, Progress indicators, etc. - They have a lot and we can save time by re-using what they have and tweaking - the style as needed. -1. Format everything with [Prettier](https://prettier.io/). This is done by - default with pre-submits. We currently don't have any custom settings. -1. Define functional React components (with types for all properties when - feasible). +1. When importing packages import external dependencies first then local dependencies. Order them alphabetically + according to the package name. +1. When trying to implement something new, check if [Chakra-UI](https://chakra-ui.com/) has components that are close + enough to your need. For example Sliders, Radio Buttons, Progress indicators, etc. They have a lot and we can save + time by re-using what they have and tweaking the style as needed. +1. Format everything with [Prettier](https://prettier.io/). This is done by default with pre-submits. We currently + don't have any custom settings. +1. Define functional React components (with types for all properties when feasible). ### Developing New Features -When working on new features or making significant changes that can't be done -within a single Pull Request, we ask that you make use of Feature Flags. +When working on new features or making significant changes that can't be done within a single Pull Request, we ask that +you make use of Feature Flags. -We've set up -[`react-feature-flags`](https://www.npmjs.com/package/react-feature-flags) to -make this easier. To get started: +We've set up [`react-feature-flags`](https://www.npmjs.com/package/react-feature-flags) to make this easier. To get +started: -1. Add a new flag entry to `website/src/flags.ts`. We have an example flag you - can copy as an example. Be sure to `isActive` to true when testing your - features but false when submitting your PR. +1. Add a new flag entry to `website/src/flags.ts`. We have an example flag you can copy as an example. Be sure to + `isActive` to true when testing your features but false when submitting your PR. 1. Use your flag wherever you add a new UI element. This can be done with: ```js @@ -188,29 +160,24 @@ import { Flags } from "react-feature-flags"; You can see an example of how this works by checking `website/src/components/Header/Headers.tsx` where we use `flagTest`. -1. Once you've finished building out the feature and it is ready for everyone - to use, it's safe to remove the `Flag` wrappers around your component and - the entry in `flags.ts`. +1. Once you've finished building out the feature and it is ready for everyone to use, it's safe to remove the `Flag` + wrappers around your component and the entry in `flags.ts`. ### URL Paths -To use stable and consistent URL paths, we recommend the following strategy for -new tasks: +To use stable and consistent URL paths, we recommend the following strategy for new tasks: -1. For any task that involves writing a free-form response, put the page under - `website/src/pages/create` with a page name matching the task type, such as - `initial_prompt.tsx`. -1. For any task that evaluates, rates, or ranks content, put the page under - `website/src/pages/evaluate` with a page name matching the task type such as - `rank_initial_prompts.tsx`. +1. For any task that involves writing a free-form response, put the page under `website/src/pages/create` with a page + name matching the task type, such as `initial_prompt.tsx`. +1. For any task that evaluates, rates, or ranks content, put the page under `website/src/pages/evaluate` with a page + name matching the task type such as `rank_initial_prompts.tsx`. -With this we'll be able to ensure these contribution pages are hidden from -logged out users but accessible to logged in users. +With this we'll be able to ensure these contribution pages are hidden from logged out users but accessible to logged in +users. ## Learn More To learn more about Next.js, take a look at the following resources: -- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js - features and API. +- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. diff --git a/website/cypress/README.md b/website/cypress/README.md index 4750cbf6..d6a2b383 100644 --- a/website/cypress/README.md +++ b/website/cypress/README.md @@ -1,24 +1,19 @@ # Component and e2e testing with Cypress -[Cypress](https://www.cypress.io/) is used for both component- and end-to-end -testing. Below there's a few examples for the context of this site. To learn -more, the -[Cypress documentation](https://docs.cypress.io/guides/getting-started/opening-the-app) -has it all. +[Cypress](https://www.cypress.io/) is used for both component- and end-to-end testing. Below there's a few examples for +the context of this site. To learn more, the +[Cypress documentation](https://docs.cypress.io/guides/getting-started/opening-the-app) has it all. -Don't get scared by the commercial offerings they offer. Their core is open -source, the cloud offering is not necesarry at all and can be replaced by CI -tooling and [community efforts](https://sorry-cypress.dev/). +Don't get scared by the commercial offerings they offer. Their core is open source, the cloud offering is not necesarry +at all and can be replaced by CI tooling and [community efforts](https://sorry-cypress.dev/). # Component testing -To write a new component test, you either create a new `.tsx` adjacent to the -component you want to test or you can use the guide presented yo you when -running `npm run cypress` which allows you to easily create the skeleton test -for an existing component. +To write a new component test, you either create a new `.tsx` adjacent to the component you want to test or you can use +the guide presented yo you when running `npm run cypress` which allows you to easily create the skeleton test for an +existing component. -If you have a `Button.tsx` component, create a file next to it called -`Button.cy.tsx` which could look like this: +If you have a `Button.tsx` component, create a file next to it called `Button.cy.tsx` which could look like this: ```typescript import React from "react"; @@ -35,28 +30,24 @@ describe(" + + + + )} + + + {caption} + + {getHeaderGroups().map((headerGroup) => ( + + {headerGroup.headers.map((header) => ( + + ))} + + ))} + + + {getRowModel().rows.map((row) => ( + + {row.getVisibleCells().map((cell) => ( + + ))} + + ))} + +
+ + {header.isPlaceholder ? null : flexRender(header.column.columnDef.header, header.getContext())} + {(header.column.columnDef as DataTableColumnDef).filterable && ( + value.id === header.id)?.value ?? ""} + onChange={(value) => handleFilterChange({ id: header.id, value })} + label={flexRender(header.column.columnDef.header, header.getContext())} + > + )} + +
{flexRender(cell.column.columnDef.cell, cell.getContext())}
+
+ + ); +}; + +const FilterModal = ({ + label, + onChange, + value, +}: { + label: ReactNode; + onChange: (val: string) => void; + value: string; +}) => { + const { isOpen, onOpen, onClose } = useDisclosure(); + + const handleInputChange = useDebouncedCallback((e: ChangeEvent) => { + onChange(e.target.value); + }, 500); + + return ( + + + + + + + + + + {label} + + + + + + ); +}; diff --git a/website/src/components/EmptyState.tsx b/website/src/components/EmptyState.tsx index 14715518..63d8d3bf 100644 --- a/website/src/components/EmptyState.tsx +++ b/website/src/components/EmptyState.tsx @@ -1,30 +1,29 @@ -import { Box, Link, Text, useColorModeValue } from "@chakra-ui/react"; -import { useRouter } from "next/router"; -import { FiAlertTriangle } from "react-icons/fi"; -import { IconType } from "react-icons/lib"; +import { Box, Text, useColorModeValue } from "@chakra-ui/react"; +import { AlertTriangle, LucideIcon } from "lucide-react"; +import NextLink from "next/link"; type EmptyStateProps = { text: string; - icon: IconType; + icon: LucideIcon; + "data-cy"?: string; }; export const EmptyState = (props: EmptyStateProps) => { const backgroundColor = useColorModeValue("white", "gray.800"); - const router = useRouter(); return ( - + - {props.text} - router.back()} color="blue.500" textUnderlineOffset="3px"> - Click here to go back - + {props.text} + + Go back to the dashboard + ); }; export const TaskEmptyState = () => { - return ; + return ; }; diff --git a/website/src/components/Explain.tsx b/website/src/components/Explain.tsx new file mode 100644 index 00000000..b571757f --- /dev/null +++ b/website/src/components/Explain.tsx @@ -0,0 +1,39 @@ +import { + IconButton, + Popover, + PopoverArrow, + PopoverBody, + PopoverCloseButton, + PopoverContent, + PopoverTrigger, + Text, +} from "@chakra-ui/react"; +import { InformationCircleIcon } from "@heroicons/react/20/solid"; + +interface ExplainProps { + explanation: string[]; +} + +export const Explain = ({ explanation }: ExplainProps) => { + return ( + + + } + > + + + + + + {explanation.map((paragraph, idx) => ( + {paragraph} + ))} + + + + ); +}; diff --git a/website/src/components/Faq.tsx b/website/src/components/Faq.tsx index 55bb3585..46d5d19c 100644 --- a/website/src/components/Faq.tsx +++ b/website/src/components/Faq.tsx @@ -27,10 +27,10 @@ export function Faq() { return ( - {t(`faq_items.q${index}`)} + {t(`faq_items.q${index as 0}`)} - {t(`faq_items.a${index}`)} + {t(`faq_items.a${index as 0}`)} ); diff --git a/website/src/components/FlaggableElement.tsx b/website/src/components/FlaggableElement.tsx index 7e28f2c2..d7572080 100644 --- a/website/src/components/FlaggableElement.tsx +++ b/website/src/components/FlaggableElement.tsx @@ -1,127 +1,69 @@ import { Box, Button, - Checkbox, - Flex, + Modal, + ModalBody, + ModalCloseButton, + ModalContent, + ModalFooter, + ModalHeader, + ModalOverlay, Popover, PopoverAnchor, - PopoverArrow, - PopoverBody, - PopoverCloseButton, - PopoverContent, PopoverTrigger, - Slider, - SliderFilledTrack, - SliderThumb, - SliderTrack, Tooltip, - useBoolean, - useColorMode, useColorModeValue, - useId, + useDisclosure, } from "@chakra-ui/react"; -import { QuestionMarkCircleIcon } from "@heroicons/react/20/solid"; -import clsx from "clsx"; -import { useEffect, useReducer } from "react"; -import { FiAlertCircle } from "react-icons/fi"; +import { AlertCircle } from "lucide-react"; +import { useState } from "react"; import { get, post } from "src/lib/api"; import { colors } from "src/styles/Theme/colors"; import { Message } from "src/types/Conversation"; -import useSWR from "swr"; +import useSWRImmutable from "swr/immutable"; import useSWRMutation from "swr/mutation"; +import { LabelInputGroup } from "./Survey/LabelInputGroup"; + interface Label { name: string; display_text: string; help_text: string; } -interface LoadLabelsAction { - type: "load_labels"; - labels: Label[]; -} - -interface UpdateValueAction { - type: "update_value"; - label_index: number; - value: number; -} - -interface ToggleLabelAction { - type: "toggle_label"; - label_index: number; - check: boolean; -} - -interface LabelValue { - label: Label; - checked: boolean; - value: number; -} - -interface FlagReportState { - label_values: LabelValue[]; - submittable: boolean; -} - interface FlaggableElementProps { children: React.ReactNode; message: Message; } +interface ValidLabelsResponse { + valid_labels: Label[]; +} + export const FlaggableElement = (props: FlaggableElementProps) => { - const [report, updateReport] = useReducer( - (state: FlagReportState, action: LoadLabelsAction | UpdateValueAction | ToggleLabelAction): FlagReportState => { - const makeState = (label_values: LabelValue[]): FlagReportState => { - const submittable = label_values.map(({ checked }) => checked).some(Boolean); - return { label_values, submittable }; - }; + const { data: response } = useSWRImmutable("/api/valid_labels", get); + const { isOpen, onOpen, onClose } = useDisclosure(); + const { valid_labels } = response || { valid_labels: [] }; + const [values, setValues] = useState([]); - switch (action.type) { - case "load_labels": - return makeState( - action.labels.map((label) => { - return { label, checked: false, value: 1 }; - }) - ); - case "toggle_label": { - const values_copy = state.label_values.slice(); - values_copy[action.label_index].checked = action.check; - return makeState(values_copy); - } - case "update_value": { - const values_copy = state.label_values.slice(); - values_copy[action.label_index].value = action.value; - return makeState(values_copy); - } - } - }, - { label_values: [], submittable: false } - ); - const [isEditing, setIsEditing] = useBoolean(); - - const { data, isLoading } = useSWR("/api/valid_labels", get); - useEffect(() => { - if (isLoading) { - return; - } - if (!data) { - updateReport({ type: "load_labels", labels: [] }); - return; - } - const { valid_labels } = data; - updateReport({ type: "load_labels", labels: valid_labels }); - }, [data, isLoading]); + const submittable = + values.some((value) => { + return value !== null; + }) && + values.length === valid_labels.length && + valid_labels.length > 0; const { trigger } = useSWRMutation("/api/set_label", post, { - onSuccess: setIsEditing.off, + onSuccess: onClose, + onError: onClose, }); const submitResponse = () => { const label_map: Map = new Map(); - report.label_values.forEach(({ label, checked, value }) => { - if (checked) { - label_map.set(label.name, value); + console.assert(valid_labels.length === values.length); + values.forEach((value, idx) => { + if (value !== null) { + label_map.set(valid_labels[idx].name, value); } }); trigger({ @@ -131,22 +73,8 @@ export const FlaggableElement = (props: FlaggableElementProps) => { }); }; - const handleCheckboxState = (checked, label_index) => { - updateReport({ type: "toggle_label", label_index, check: checked }); - }; - const handleSliderState = (value, label_index) => { - updateReport({ type: "update_value", label_index, value }); - }; - return ( - + {props.children} @@ -154,33 +82,24 @@ export const FlaggableElement = (props: FlaggableElementProps) => { - - - - - - - - {report.label_values.map(({ label, checked, value }, i) => ( - - ))} - + + + + Select one or more labels that apply. + + + name)} onChange={setValues} /> + + - - - + + + ); }; - -interface FlagCheckboxProps { - label: Label; - idx: number; - checked: boolean; - sliderValue: number; - checkboxHandler: (newVal: boolean, idx: number) => void; - sliderHandler: (newVal: number, idx: number) => void; -} - -export function FlagCheckbox(props: FlagCheckboxProps): JSX.Element { - let AdditionalExplanation = null; - if (props.label.help_text) { - AdditionalExplanation = ( - - - ); - } - - const id = useId(); - const { colorMode } = useColorMode(); - - const labelTextClass = - colorMode === "light" - ? `text-${colors.light.text} hover:text-blue-700` - : `text-${colors.dark.text} hover:text-blue-400`; - - return ( - -
- { - props.checkboxHandler(e.target.checked, props.idx); - }} - /> - -
-
{ - if (!props.checked) { - props.checkboxHandler(true, props.idx); - } - }} - > - { - props.sliderHandler(val / 100, props.idx); - }} - > - - - - - -
-
- ); -} diff --git a/website/src/components/Header/Header.tsx b/website/src/components/Header/Header.tsx index a1b36123..0d70a442 100644 --- a/website/src/components/Header/Header.tsx +++ b/website/src/components/Header/Header.tsx @@ -1,10 +1,11 @@ import { Box, Button, Flex, Text } from "@chakra-ui/react"; +import { User } from "lucide-react"; import Image from "next/image"; import Link from "next/link"; import { useSession } from "next-auth/react"; import { useTranslation } from "next-i18next"; import { Flags } from "react-feature-flags"; -import { FaUser } from "react-icons/fa"; +import { LanguageSelector } from "src/components/LanguageSelector"; import { UserMenu } from "./UserMenu"; @@ -16,7 +17,7 @@ function AccountButton() { return ( - @@ -45,6 +46,7 @@ export function Header() { FlagTest + diff --git a/website/src/components/Header/UserMenu.tsx b/website/src/components/Header/UserMenu.tsx index 6fdde69e..8b5de035 100644 --- a/website/src/components/Header/UserMenu.tsx +++ b/website/src/components/Header/UserMenu.tsx @@ -11,16 +11,15 @@ import { Text, useColorModeValue, } from "@chakra-ui/react"; +import { AlertTriangle, Layout, LogOut, Settings, Shield } from "lucide-react"; import NextLink from "next/link"; import { signOut, useSession } from "next-auth/react"; import { useTranslation } from "next-i18next"; import React, { ElementType, useCallback } from "react"; -import { FiAlertTriangle, FiLayout, FiLogOut, FiSettings, FiShield } from "react-icons/fi"; interface MenuOption { name: string; href: string; - desc: string; icon: ElementType; isExternal: boolean; } @@ -40,22 +39,19 @@ export function UserMenu() { { name: t("dashboard"), href: "/dashboard", - desc: t("dashboard"), - icon: FiLayout, + icon: Layout, isExternal: false, }, { name: t("account_settings"), href: "/account", - desc: t("account_settings"), - icon: FiSettings, + icon: Settings, isExternal: false, }, { name: t("report_a_bug"), href: "https://github.com/LAION-AI/Open-Assistant/issues/new/choose", - desc: t("report_a_bug"), - icon: FiAlertTriangle, + icon: AlertTriangle, isExternal: true, }, ]; @@ -64,8 +60,7 @@ export function UserMenu() { options.unshift({ name: t("admin_dashboard"), href: "/admin", - desc: t("admin_dashboard"), - icon: FiShield, + icon: Shield, isExternal: false, }); } @@ -98,7 +93,7 @@ export function UserMenu() { _hover={{ textDecoration: "none" }} > - @@ -106,7 +101,7 @@ export function UserMenu() { - diff --git a/website/src/components/Icons/Discord.tsx b/website/src/components/Icons/Discord.tsx new file mode 100644 index 00000000..ea1118fb --- /dev/null +++ b/website/src/components/Icons/Discord.tsx @@ -0,0 +1,16 @@ +import { LucideIcon } from "lucide-react"; + +export const Discord: LucideIcon = ({ size = 24, ...rest }) => { + return ( + + + + ); +}; diff --git a/website/src/components/LanguageSelector/LanguageSelector.tsx b/website/src/components/LanguageSelector/LanguageSelector.tsx new file mode 100644 index 00000000..aea25dcd --- /dev/null +++ b/website/src/components/LanguageSelector/LanguageSelector.tsx @@ -0,0 +1,41 @@ +import { Select } from "@chakra-ui/react"; +import { useRouter } from "next/router"; +import { useTranslation } from "next-i18next"; +import { useCallback, useMemo } from "react"; +import cookie from "react-cookies"; + +const LanguageSelector = () => { + const router = useRouter(); + const { i18n } = useTranslation(); + + // Memo the set of locales and their display names. + const localesAndNames = useMemo(() => { + return router.locales.map((locale) => ({ + locale, + name: new Intl.DisplayNames([locale], { type: "language" }).of(locale), + })); + }, [router.locales]); + + const languageChanged = useCallback( + async (option) => { + const locale = option.target.value; + cookie.save("NEXT_LOCALE", locale, { path: "/" }); + const path = router.asPath; + return router.push(path, path, { locale }); + }, + [router] + ); + + const { language: currentLanguage } = i18n; + return ( + + ); +}; + +export { LanguageSelector }; diff --git a/website/src/components/LanguageSelector/index.tsx b/website/src/components/LanguageSelector/index.tsx new file mode 100644 index 00000000..feb9f322 --- /dev/null +++ b/website/src/components/LanguageSelector/index.tsx @@ -0,0 +1 @@ +export * from "./LanguageSelector"; diff --git a/website/src/components/Layout.tsx b/website/src/components/Layout.tsx index 55085550..1b5bf430 100644 --- a/website/src/components/Layout.tsx +++ b/website/src/components/Layout.tsx @@ -1,8 +1,8 @@ // https://nextjs.org/docs/basic-features/layouts import { Box, Grid } from "@chakra-ui/react"; +import { Activity, BarChart2, Layout, MessageSquare, Users } from "lucide-react"; import type { NextPage } from "next"; -import { FiBarChart2, FiLayout, FiMessageSquare, FiUsers, FiActivity } from "react-icons/fi"; import { Header } from "src/components/Header"; import { SlimFooter } from "./Dashboard/SlimFooter"; @@ -38,19 +38,19 @@ export const getDashboardLayout = (page: React.ReactElement) => ( label: "Dashboard", pathname: "/dashboard", desc: "Dashboard Home", - icon: FiLayout, + icon: Layout, }, { label: "Messages", pathname: "/messages", desc: "Messages Dashboard", - icon: FiMessageSquare, + icon: MessageSquare, }, { label: "Leaderboard", pathname: "/leaderboard", desc: "User Leaderboard", - icon: FiBarChart2, + icon: BarChart2, }, ]} > @@ -73,13 +73,13 @@ export const getAdminLayout = (page: React.ReactElement) => ( label: "Users", pathname: "/admin", desc: "Users Dashboard", - icon: FiUsers, + icon: Users, }, { label: "Status", pathname: "/admin/status", desc: "Status Dashboard", - icon: FiActivity, + icon: Activity, }, ]} > diff --git a/website/src/components/LeaderboardGridCell/LeaderboardGridCell.tsx b/website/src/components/LeaderboardGridCell/LeaderboardGridCell.tsx deleted file mode 100644 index 7886784a..00000000 --- a/website/src/components/LeaderboardGridCell/LeaderboardGridCell.tsx +++ /dev/null @@ -1,90 +0,0 @@ -import { Table, TableContainer, Tbody, Td, Text, Th, Thead, Tr, useColorModeValue } from "@chakra-ui/react"; -import { useTranslation } from "next-i18next"; -import React, { useMemo } from "react"; -import { useTable } from "react-table"; -import { get } from "src/lib/api"; -import { LeaderboardReply, LeaderboardTimeFrame } from "src/types/Leaderboard"; -import useSWRImmutable from "swr/immutable"; - -const getColumns = (t) => [ - { - Header: t("rank"), - accessor: "rank", - style: { width: "90px" }, - }, - { - Header: t("score"), - accessor: "leader_score", - style: { width: "90px" }, - }, - { - Header: t("user"), - accessor: "display_name", - }, -]; - -/** - * Presents a grid of leaderboard entries with more detailed information. - */ -const LeaderboardGridCell = ({ timeFrame }: { timeFrame: LeaderboardTimeFrame }) => { - const { t } = useTranslation(["leaderboard", "common"]); - const { data: reply } = useSWRImmutable(`/api/leaderboard?time_frame=${timeFrame}`, get, { - revalidateOnMount: true, - }); - - const columns = useMemo(() => getColumns(t), [t]); - - const { getTableProps, getTableBodyProps, headerGroups, rows, prepareRow } = useTable({ - columns, - data: reply?.leaderboard ?? [], - }); - - const backgroundColor = useColorModeValue("white", "gray.800"); - - const lastUpdated = useMemo(() => { - const val = new Date(reply?.last_updated); - return t("last_updated_at", { val, formatParams: { val: { dateStyle: "full", timeStyle: "short" } } }); - }, [t, reply?.last_updated]); - - if (!reply) { - return null; - } - - return ( - - - - {headerGroups.map((headerGroup, idx) => ( - - {headerGroup.headers.map((column) => ( - - ))} - - ))} - - - - {rows.map((row) => { - prepareRow(row); - return ( - - {row.cells.map((cell, idx) => { - return ( - - ); - })} - - ); - })} - -
- {column.render("Header")} -
- {cell.render("Cell")} -
- {lastUpdated} -
- ); -}; - -export { LeaderboardGridCell }; diff --git a/website/src/components/LeaderboardGridCell/index.tsx b/website/src/components/LeaderboardGridCell/index.tsx deleted file mode 100644 index c4657eb6..00000000 --- a/website/src/components/LeaderboardGridCell/index.tsx +++ /dev/null @@ -1 +0,0 @@ -export * from "./LeaderboardGridCell"; diff --git a/website/src/components/LeaderboardTable/LeaderboardTable.tsx b/website/src/components/LeaderboardTable/LeaderboardTable.tsx new file mode 100644 index 00000000..847e4dc4 --- /dev/null +++ b/website/src/components/LeaderboardTable/LeaderboardTable.tsx @@ -0,0 +1,64 @@ +import { CircularProgress } from "@chakra-ui/react"; +import { createColumnHelper } from "@tanstack/react-table"; +import { useTranslation } from "next-i18next"; +import React, { useMemo } from "react"; +import { get } from "src/lib/api"; +import { LeaderboardEntity, LeaderboardReply, LeaderboardTimeFrame } from "src/types/Leaderboard"; +import useSWRImmutable from "swr/immutable"; + +import { DataTable } from "../DataTable"; + +const columnHelper = createColumnHelper(); + +/** + * Presents a grid of leaderboard entries with more detailed information. + */ +export const LeaderboardTable = ({ timeFrame, limit }: { timeFrame: LeaderboardTimeFrame; limit: number }) => { + const { t } = useTranslation("leaderboard"); + const { + data: reply, + isLoading, + error, + } = useSWRImmutable(`/api/leaderboard?time_frame=${timeFrame}&limit=${limit}`, get, { + revalidateOnMount: true, + }); + + const columns = useMemo( + () => [ + columnHelper.accessor("rank", { + header: t("rank"), + }), + columnHelper.accessor("display_name", { + header: t("user"), + }), + columnHelper.accessor("leader_score", { + header: t("score"), + }), + columnHelper.accessor("prompts", { + header: t("prompt"), + }), + columnHelper.accessor((row) => row.replies_assistant + row.replies_prompter, { + header: t("reply"), + }), + columnHelper.accessor((row) => row.labels_full + row.labels_simple, { + header: t("label"), + }), + ], + [t] + ); + + const lastUpdated = useMemo(() => { + const val = new Date(reply?.last_updated); + return t("last_updated_at", { val, formatParams: { val: { dateStyle: "full", timeStyle: "short" } } }); + }, [t, reply?.last_updated]); + + if (isLoading) { + return ; + } + + if (error) { + return Unable to load leaderboard; + } + + return ; +}; diff --git a/website/src/components/LeaderboardTable/index.tsx b/website/src/components/LeaderboardTable/index.tsx new file mode 100644 index 00000000..d8de5feb --- /dev/null +++ b/website/src/components/LeaderboardTable/index.tsx @@ -0,0 +1 @@ +export * from "./LeaderboardTable"; diff --git a/website/src/components/Messages.tsx b/website/src/components/Messages.tsx index 0934c5af..c9d77e3c 100644 --- a/website/src/components/Messages.tsx +++ b/website/src/components/Messages.tsx @@ -20,7 +20,7 @@ export const Messages = ({ messages }: MessagesProps) => { return {items}; }; -export const MessageView = forwardRef((message: Message, ref) => { +export const MessageView = forwardRef, "div">((message: Partial, ref) => { const { colorMode } = useColorMode(); const bgColor = useMemo(() => { diff --git a/website/src/components/Messages/LabelPopup.tsx b/website/src/components/Messages/LabelPopup.tsx new file mode 100644 index 00000000..b2b95278 --- /dev/null +++ b/website/src/components/Messages/LabelPopup.tsx @@ -0,0 +1,76 @@ +import { + Button, + Modal, + ModalBody, + ModalCloseButton, + ModalContent, + ModalFooter, + ModalHeader, + ModalOverlay, +} from "@chakra-ui/react"; +import { useTranslation } from "next-i18next"; +import { useState } from "react"; +import { LabelInputGroup } from "src/components/Survey/LabelInputGroup"; +import { get, post } from "src/lib/api"; +import useSWRImmutable from "swr/immutable"; +import useSWRMutation from "swr/mutation"; + +interface LabelMessagePopupProps { + messageId: string; + show: boolean; + onClose: () => void; +} + +interface Label { + name: string; + display_text: string; + help_text: string; +} + +interface ValidLabelsResponse { + valid_labels: Label[]; +} + +export const LabelMessagePopup = ({ messageId, show, onClose }: LabelMessagePopupProps) => { + const { t } = useTranslation("message"); + const { data: response } = useSWRImmutable("/api/valid_labels", get); + const valid_labels = response?.valid_labels ?? []; + const [values, setValues] = useState(null); + + const { trigger: setLabels } = useSWRMutation("/api/set_label", post); + + const submit = () => { + const label_map: Map = new Map(); + console.assert(valid_labels.length === values.length); + values.forEach((value, idx) => { + if (value !== null) { + label_map.set(valid_labels[idx].name, value); + } + }); + setLabels({ + message_id: messageId, + label_map: Object.fromEntries(label_map), + }); + + setValues(null); + onClose(); + }; + + return ( + + + + {t("label_title")} + + + name)} onChange={setValues} /> + + + + + + + ); +}; diff --git a/website/src/components/Messages/MessageEmojiButton.stories.tsx b/website/src/components/Messages/MessageEmojiButton.stories.tsx new file mode 100644 index 00000000..d74836d5 --- /dev/null +++ b/website/src/components/Messages/MessageEmojiButton.stories.tsx @@ -0,0 +1,34 @@ +import React from "react"; + +import { MessageEmojiButton } from "./MessageEmojiButton"; + +// eslint-disable-next-line import/no-anonymous-default-export +export default { + title: "Messages/MessageEmojiButton", + component: MessageEmojiButton, +}; + +const Template = ({ emoji, count, checked }: { emoji: string; count: number; checked?: boolean }) => { + return ; +}; + +export const Default = Template.bind({}); +Default.args = { + emoji: "+1", + count: 7, + checked: false, +}; + +export const BigNumber = Template.bind({}); +BigNumber.args = { + emoji: "+1", + count: 999, + checked: false, +}; + +export const Checked = Template.bind({}); +Checked.args = { + emoji: "+1", + count: 2, + checked: true, +}; diff --git a/website/src/components/Messages/MessageEmojiButton.tsx b/website/src/components/Messages/MessageEmojiButton.tsx new file mode 100644 index 00000000..8b5c9ff7 --- /dev/null +++ b/website/src/components/Messages/MessageEmojiButton.tsx @@ -0,0 +1,48 @@ +import { Button } from "@chakra-ui/react"; +import { BoxSelect, Flag, LucideProps, ThumbsDown, ThumbsUp } from "lucide-react"; +import { ReactElement } from "react"; +import { MessageEmoji } from "src/types/Conversation"; + +type EmojiIconPurpose = "MINI_BUTTON" | "NORMAL"; + +const defaultIconProps: (purpose: EmojiIconPurpose) => LucideProps = (purpose: EmojiIconPurpose) => { + if (purpose === "MINI_BUTTON") return { height: "1em" }; + return {}; +}; + +export const getEmojiIcon = (name: string, purpose: EmojiIconPurpose): ReactElement => { + switch (name) { + case "+1": + return ; + case "-1": + return ; + case "flag": + case "red_flag": + return ; + default: + return ; + } +}; + +interface MessageEmojiButtonProps { + emoji: MessageEmoji; + checked?: boolean; + onClick: () => void; +} + +export const MessageEmojiButton = ({ emoji, checked, onClick }: MessageEmojiButtonProps) => { + return ( + + ); +}; diff --git a/website/src/components/Messages/MessageTable.stories.tsx b/website/src/components/Messages/MessageTable.stories.tsx new file mode 100644 index 00000000..bc03aed1 --- /dev/null +++ b/website/src/components/Messages/MessageTable.stories.tsx @@ -0,0 +1,110 @@ +import React from "react"; +import { Message } from "src/types/Conversation"; + +import { MessageTable } from "./MessageTable"; + +// eslint-disable-next-line import/no-anonymous-default-export +export default { + title: "Messages/MessageTable", + component: MessageTable, +}; + +const Template = ({ + messages, + enableLink, + highlightLastMessage, +}: { + messages: Message[]; + enableLink: boolean; + highlightLastMessage: boolean; +}) => { + return ; +}; + +export const Default = Template.bind({}); +Default.args = { + messages: [ + { + text: "I'm unsure how to interpret this. Is it a riddle?", + is_assistant: true, + id: "", + frontend_message_id: "", + emojis: {}, + user_emojis: [], + }, + { + text: "No, I just wanted to see how you reply when I type random characters. Can you tell me who invented Wikipedia?", + is_assistant: false, + id: "", + frontend_message_id: "", + emojis: { "-1": 11, red_flag: 2 }, + user_emojis: [], + }, + { + text: "Sorry, my cat sat on my keyboard. Can you print a cat in ASCII art?", + is_assistant: false, + id: "", + frontend_message_id: "", + emojis: {}, + user_emojis: [], + }, + ], + enableLink: true, + highlightLastMessage: false, +}; + +export const Conversation = Template.bind({}); +Conversation.args = { + messages: [ + { + text: "Hello! How can I help you?", + is_assistant: true, + id: "", + frontend_message_id: "", + emojis: {}, + user_emojis: [], + }, + { + text: "Who were the 8 presidents before George Washington?", + is_assistant: false, + id: "", + frontend_message_id: "", + emojis: {}, + user_emojis: [], + }, + ], + enableLink: false, + highlightLastMessage: true, +}; + +export const LongText = Template.bind({}); +LongText.args = { + messages: [ + { + text: "Okay, here\u2019s my answer. The thing is, in a job interview you are applying to the position, right? Which means that you need to try to make yourself sound as good a fit as possible. So that means it\u2019s a mistake to wear something that people would think of as a silly or stupid outfit. It\u2019s good to wear something in line with what people expect the job to require, and ideally you want to wear something that you can walk around comfortably in. At the same time, if you wear something that looks kind of dorky, you might not stand out as much as someone who doesn\u2019t, because the other candidates probably all have something slightly weird or wacky about them too. So, I\u2019d say, do you like your outfit? Wear something that you\u2019re comfortable in, that you think you look good in, and is in line with what the job would require. Also, for interview prep, I would focus on making yourself look as professional as possible, from your hair to your outfit. What do you think of this?", + is_assistant: true, + id: "", + frontend_message_id: "", + emojis: {}, + user_emojis: [], + }, + { + text: "Yes, I think they can be helpful when the child misbehaves, but they should be used with a little bit of compassion and understanding that it\u2019s not the natural state of things to have an adult yelling at them. Time outs are also often used without letting the child know how they\u2019re getting out of the time out, which can make it feel arbitrary or like a punishment, rather than a consequence for something they did. It\u2019s really easy for adults to do this kind of thing unconsciously. It\u2019s easy to get caught up in the notion that \u201cThey\u2019re in time out, and that\u2019s the end of it!\u201d but kids can be pretty imaginative, and they can use their own creativity to make their way out of time outs. A compassionate time out ends when the child shows a sign of understanding what they\u2019ve done wrong, and are ready to begin again. That way the child knows they\u2019re learning, and that the parent is seeing them as an intelligent person, even if they sometimes mess up. You can still use the other techniques you were using to be tough when necessary, but using a compassionate approach will let you use them without actually using them!", + is_assistant: false, + id: "", + frontend_message_id: "", + emojis: {}, + user_emojis: [], + }, + { + text: "No. The USA was founded by a Puritan group of Protestants, but it didn\u2019t adopt the religion of the Puritans until much later, and it was always a secular state. The Puritans observed the Sabbath on Sunday, and the Puritans only had a small influence in the early history of the USA. It\u2019s difficult to trace the origins of closing stores on Sunday, but one early and short-lived attempt at forcing the Sabbath on people in the 1800s was motivated by the Protestant ideal that people should spend Sunday focusing on spiritual activities. By the mid-1800s, when the Sunday closing law was made, there was not a lot of pressure from that standpoint, but the church had begun to advocate for Sunday closing laws as a way of counteracting the negative effects of industrialization on the day of rest. Even after that shift, closing stores on Sunday was not always possible, since the religious Sunday was not always chosen for observance. And as industrialization accelerated and mechanization made it possible to operate stores on Sunday, the law was not enforced as much as people liked. The day of rest was also being violated by stores that stayed open all day on Sunday, so closing stores on Sundays became an effort to protect the Sabbath for all citizens.", + is_assistant: false, + id: "", + frontend_message_id: "", + emojis: {}, + user_emojis: [], + }, + ], + enableLink: true, + highlightLastMessage: false, +}; diff --git a/website/src/components/Messages/MessageTable.tsx b/website/src/components/Messages/MessageTable.tsx index ed98752c..2d39f346 100644 --- a/website/src/components/Messages/MessageTable.tsx +++ b/website/src/components/Messages/MessageTable.tsx @@ -5,13 +5,19 @@ import { Message } from "src/types/Conversation"; interface MessageTableProps { messages: Message[]; enableLink?: boolean; + highlightLastMessage?: boolean; } -export function MessageTable({ messages, enableLink }: MessageTableProps) { +export function MessageTable({ messages, enableLink, highlightLastMessage }: MessageTableProps) { return ( - {messages.map((item) => ( - + {messages.map((message, idx) => ( + ))} ); diff --git a/website/src/components/Messages/MessageTableEntry.stories.tsx b/website/src/components/Messages/MessageTableEntry.stories.tsx new file mode 100644 index 00000000..3550d00f --- /dev/null +++ b/website/src/components/Messages/MessageTableEntry.stories.tsx @@ -0,0 +1,62 @@ +import React from "react"; +import { Message } from "src/types/Conversation"; + +import { MessageTableEntry } from "./MessageTableEntry"; + +// eslint-disable-next-line import/no-anonymous-default-export +export default { + title: "Messages/MessageTableEntry", + component: MessageTableEntry, +}; + +const Template = ({ enabled, highlight, ...message }) => { + return ; +}; + +export const Default = Template.bind({}); +Default.args = { + text: "Who were the 8 presidents before George Washington?", + is_assistant: false, + id: "", + frontend_message_id: "", + enabled: true, + highlight: false, + emojis: {}, + user_emojis: [], +}; + +export const Asistant = Template.bind({}); +Asistant.args = { + text: "Who were the 8 presidents before George Washington?", + is_assistant: true, + id: "", + frontend_message_id: "", + enabled: true, + highlight: false, + emojis: {}, + user_emojis: [], +}; + +export const LongText = Template.bind({}); +LongText.args = { + text: "Assistant: No. The USA was founded by a Puritan group of Protestants, but it didn\u2019t adopt the religion of the Puritans until much later, and it was always a secular state. The Puritans observed the Sabbath on Sunday, and the Puritans only had a small influence in the early history of the USA. It\u2019s difficult to trace the origins of closing stores on Sunday, but one early and short-lived attempt at forcing the Sabbath on people in the 1800s was motivated by the Protestant ideal that people should spend Sunday focusing on spiritual activities. By the mid-1800s, when the Sunday closing law was made, there was not a lot of pressure from that standpoint, but the church had begun to advocate for Sunday closing laws as a way of counteracting the negative effects of industrialization on the day of rest. Even after that shift, closing stores on Sunday was not always possible, since the religious Sunday was not always chosen for observance. And as industrialization accelerated and mechanization made it possible to operate stores on Sunday, the law was not enforced as much as people liked. The day of rest was also being violated by stores that stayed open all day on Sunday, so closing stores on Sundays became an effort to protect the Sabbath for all citizens.", + is_assistant: true, + id: "", + frontend_message_id: "", + enabled: true, + highlight: false, + emojis: {}, + user_emojis: [], +}; + +export const WithEmoji = Template.bind({}); +WithEmoji.args = { + text: "As you\u2019ve mentioned, Star Wars has many sequels, prequels, and crossovers. The official list of movies in Star Wars is:", + is_assistant: true, + id: "", + frontend_message_id: "", + enabled: true, + highlight: false, + emojis: { "-1": 5, "+1": 1 }, + user_emojis: ["-1"], +}; diff --git a/website/src/components/Messages/MessageTableEntry.tsx b/website/src/components/Messages/MessageTableEntry.tsx index 1205991e..7202903a 100644 --- a/website/src/components/Messages/MessageTableEntry.tsx +++ b/website/src/components/Messages/MessageTableEntry.tsx @@ -1,22 +1,47 @@ -import { Avatar, Box, HStack, LinkBox, useBreakpoint, useBreakpointValue, useColorModeValue } from "@chakra-ui/react"; +import { + Avatar, + Box, + HStack, + Menu, + MenuButton, + MenuDivider, + MenuGroup, + MenuItem, + MenuList, + SimpleGrid, + useBreakpointValue, + useColorModeValue, + useDisclosure, +} from "@chakra-ui/react"; import { boolean } from "boolean"; -import Link from "next/link"; +import { ClipboardList, Flag, MessageSquare, MoreHorizontal } from "lucide-react"; import { useRouter } from "next/router"; -import { useCallback, useMemo } from "react"; -import { FlaggableElement } from "src/components/FlaggableElement"; -import { Message } from "src/types/Conversation"; +import { useTranslation } from "next-i18next"; +import { useCallback, useEffect, useMemo, useState } from "react"; +import { LabelMessagePopup } from "src/components/Messages/LabelPopup"; +import { getEmojiIcon, MessageEmojiButton } from "src/components/Messages/MessageEmojiButton"; +import { ReportPopup } from "src/components/Messages/ReportPopup"; +import { post } from "src/lib/api"; +import { Message, MessageEmojis } from "src/types/Conversation"; +import { colors } from "styles/Theme/colors"; +import useSWRMutation from "swr/mutation"; interface MessageTableEntryProps { - item: Message; + message: Message; enabled?: boolean; + highlight?: boolean; } -export function MessageTableEntry(props: MessageTableEntryProps) { +export function MessageTableEntry({ message, enabled, highlight }: MessageTableEntryProps) { const router = useRouter(); + const [emojiState, setEmojis] = useState({ emojis: {}, user_emojis: [] }); + useEffect(() => { + setEmojis({ emojis: message.emojis, user_emojis: message.user_emojis }); + }, [message.emojis, message.user_emojis]); - const { item } = props; - - const goToMessage = useCallback(() => router.push(`/messages/${item.id}`), [router, item.id]); + const goToMessage = useCallback(() => router.push(`/messages/${message.id}`), [router, message.id]); + const { isOpen: reportPopupOpen, onOpen: showReportPopup, onClose: closeReportPopup } = useDisclosure(); + const { isOpen: labelPopupOpen, onOpen: showLabelPopup, onClose: closeLabelPopup } = useDisclosure(); const backgroundColor = useColorModeValue("gray.100", "gray.700"); const backgroundColor2 = useColorModeValue("#DFE8F1", "#42536B"); @@ -31,31 +56,124 @@ export function MessageTableEntry(props: MessageTableEntryProps) { borderColor={borderColor} size={inlineAvatar ? "xs" : "sm"} mr={inlineAvatar ? 2 : 0} - name={`${boolean(item.is_assistant) ? "Assistant" : "User"}`} - src={`${boolean(item.is_assistant) ? "/images/logos/logo.png" : "/images/temp-avatars/av1.jpg"}`} + name={`${boolean(message.is_assistant) ? "Assistant" : "User"}`} + src={`${boolean(message.is_assistant) ? "/images/logos/logo.png" : "/images/temp-avatars/av1.jpg"}`} /> ), - [borderColor, inlineAvatar, item.is_assistant] + [borderColor, inlineAvatar, message.is_assistant] ); + const highlightColor = useColorModeValue(colors.light.highlight, colors.dark.highlight); + + const { trigger: sendEmojiChange } = useSWRMutation(`/api/messages/${message.id}/emoji`, post, { + onSuccess: setEmojis, + }); + const react = (emoji: string, state: boolean) => { + sendEmojiChange({ op: state ? "add" : "remove", emoji }); + }; return ( - - - {!inlineAvatar && avatar} - + {!inlineAvatar && avatar} + + {inlineAvatar && avatar} + {message.text} + e.stopPropagation()} > - {inlineAvatar && avatar} - {item.text} - - - + {Object.entries(emojiState.emojis).map(([emoji, count]) => ( + react(emoji, !emojiState.user_emojis.includes(emoji))} + /> + ))} + + + + +
+ ); } + +const EmojiMenuItem = ({ + emoji, + checked, + react, +}: { + emoji: string; + checked?: boolean; + react: (emoji: string, state: boolean) => void; +}) => { + const activeColor = useColorModeValue(colors.light.active, colors.dark.active); + + return ( + react(emoji, !checked)} justifyContent="center" color={checked ? activeColor : undefined}> + {getEmojiIcon(emoji, "NORMAL")} + + ); +}; + +const MessageActions = ({ + react, + userEmoji, + onLabel, + onReport, + messageId, +}: { + react: (emoji: string, state: boolean) => void; + userEmoji: string[]; + onLabel: () => void; + onReport: () => void; + messageId: string; +}) => { + const { t } = useTranslation("message"); + + return ( + + + + + + + + {["+1", "-1"].map((emoji) => ( + + ))} + + + + }> + {t("label_action")} + + }> + {t("report_action")} + + + }> + {t("open_new_tab_action")} + + + + ); +}; diff --git a/website/src/components/Messages/MessageWithChildren.tsx b/website/src/components/Messages/MessageWithChildren.tsx index ca29c410..f47cfac5 100644 --- a/website/src/components/Messages/MessageWithChildren.tsx +++ b/website/src/components/Messages/MessageWithChildren.tsx @@ -52,7 +52,7 @@ export function MessageWithChildren(props: MessageWithChildrenProps) { {isFirst ? "Message" : depth === 1 ? "Children" : "Ancestor"} - + @@ -86,9 +86,9 @@ export function MessageWithChildren(props: MessageWithChildrenProps) { gap="4" shadow="base" > - {children.map((item, idx) => ( + {children.map((message, idx) => ( - + ))} diff --git a/website/src/components/Messages/ReportPopup.tsx b/website/src/components/Messages/ReportPopup.tsx new file mode 100644 index 00000000..67a2ea1b --- /dev/null +++ b/website/src/components/Messages/ReportPopup.tsx @@ -0,0 +1,56 @@ +import { + Button, + Modal, + ModalBody, + ModalCloseButton, + ModalContent, + ModalFooter, + ModalHeader, + ModalOverlay, + Textarea, +} from "@chakra-ui/react"; +import { useTranslation } from "next-i18next"; +import { useState } from "react"; +import { post } from "src/lib/api"; +import useSWRMutation from "swr/mutation"; + +interface ReportPopupProps { + messageId: string; + show: boolean; + onClose: () => void; +} + +export const ReportPopup = ({ messageId, show, onClose }: ReportPopupProps) => { + const { t } = useTranslation("message"); + const [text, setText] = useState(""); + const { trigger } = useSWRMutation("/api/report", post); + + const submit = () => { + trigger({ + message_id: messageId, + text, + }); + + setText(""); + onClose(); + }; + + return ( + + + + {t("report_title")} + + +