From d1d185edb6a4e3b3787f09a8fe0e6ea47422c80d Mon Sep 17 00:00:00 2001
From: Yada Pruksachatkun <yada.pruksachatkun@gmail.com>
Date: Mon, 23 Jan 2023 05:24:43 -0800
Subject: [PATCH] Adding MT Sample clinical note dataset (#804)

* Adding clinical note dataset

* Fix flake8 issues

* Fix prepare.py for straggling commas, replace assistant with Rosey in prompt

Co-authored-by: Yada P <yadapruk@Tests-MacBook-Pro.local>
---
 .../datasets/mt_note_generation/README.md     | 101 ++++++++++++++
 .../datasets/mt_note_generation/__init__.py   |   0
 .../datasets/mt_note_generation/hub.py        |  21 +++
 .../mt_note_generation/mt_note_generation.py  | 123 ++++++++++++++++++
 .../datasets/mt_note_generation/prepare.py    |  84 ++++++++++++
 train_toxicity_model.py                       |   0
 6 files changed, 329 insertions(+)
 create mode 100644 openassistant/datasets/mt_note_generation/README.md
 create mode 100644 openassistant/datasets/mt_note_generation/__init__.py
 create mode 100644 openassistant/datasets/mt_note_generation/hub.py
 create mode 100644 openassistant/datasets/mt_note_generation/mt_note_generation.py
 create mode 100644 openassistant/datasets/mt_note_generation/prepare.py
 create mode 100644 train_toxicity_model.py

diff --git a/openassistant/datasets/mt_note_generation/README.md b/openassistant/datasets/mt_note_generation/README.md
new file mode 100644
index 00000000..74754417
--- /dev/null
+++ b/openassistant/datasets/mt_note_generation/README.md
@@ -0,0 +1,101 @@
+---
+annotations_creators:
+  - no-annotation
+language:
+  - en
+language_creators:
+  - machine-generated
+license:
+  - mit
+multilinguality:
+  - monolingual
+pretty_name: "MT Note Generation"
+size_categories:
+  - <500
+source_datasets:
+  - mt_samples
+tags:
+  - open-assistant
+task_categories:
+  - conversational
+task_ids:
+  - dialogue-generation
+dataset_info:
+  features:
+    - name: conversation
+      dtype: string
+  splits:
+    - name: train
+      num_bytes: 1060800
+      num_examples: 270
+    - name: test
+      num_bytes: 339323
+      num_examples: 90
+    - name: validation
+      num_bytes: 337018
+      num_examples: 90
+---
+
+# Dataset Card for MT Samples Clinical Note Generation
+
+## Dataset Description
+
+- **Repository:**
+  [Code](https://github.com/LAION-AI/Open-Assistant/tree/main/openassistant/datasets/mt_note_generation/)
+
+## Dataset Summary
+
+MT Note Generation is a set of synthetic dialogues between Assistant and User
+where the user asks the assistant to generate a clinical note for a patient
+persona.
+
+This data was created by synthesizing note-generation dialogues from the
+sample medical transcriptions published by [MT Samples](https://mtsamples.com/).
+
+Example:
+
+```
+User: Write a clinical note about a patient with the following chief complaint:  Stomach pain for 2 weeks.
+
+Rosey:
+
+CHIEF COMPLAINT: Stomach pain for 2 weeks. HISTORY OF PRESENT ILLNESS: The patient is a 45yo Mexican man without significant past medical history who presents to the emergency room with complaints of mid-epigastric and right upper quadrant abdominal pain for the last 14 days. The pain was initially crampy and burning in character and was relieved with food intake. He also reports that it initially was associated with a sour taste in his mouth. He went to his primary care physician who prescribed cimetidine 400mg qhs x 5 days; however, this did not relieve his symptoms. In fact, the pain has worsened such that the pain now radiates to the back but is waxing and waning in duration. It is relieved with standing and ambulation and exacerbated when lying in a supine position. He reports a decrease in appetite associated with a 4 lb. wt loss over the last 2 wks. He does have nausea with only one episode of non-bilious, non-bloody emesis on day of admission. He reports a 2 wk history of subjective fever and diaphoresis. He denies any diarrhea, constipation, dysuria, melena, or hematochezia. His last bowel movement was during the morning of admission and was normal. He denies any travel in the last 9 years and sick contacts.PAST MEDICAL HISTORY: Right inguinal groin cyst removal 15 years ago. Unknown etiology. No recurrence.
+
+PAST SURGICAL HISTORY: Left femoral neck fracture with prosthesis secondary to a fall 4 years ago.
+
+FAMILY HISTORY: Mother with diabetes. No history of liver disease. No malignancies.
+
+SOCIAL HISTORY: The patient was born in central Mexico but moved to the United States 9 years ago. He is on disability due to his prior femoral fracture. He denies any tobacco or illicit drug use. He only drinks alcohol socially, no more than 1 drink every few weeks. He is married and has 3 healthy children. He denies any tattoos or risky sexual behavior.
+
+ALLERGIES: NKDA.
+
+MEDICATIONS: Tylenol prn (1-2 tabs every other day for the last 2 wks), Cimetidine 400mg po qhs x 5 days.
+
+REVIEW OF SYSTEMS:  No headache, vision changes. No shortness of breath. No chest pain or palpitations.
+
+PHYSICAL EXAMINATION: Vitals: T 100.9-102.7 BP 136/86 Pulse 117 RR 12 98% sat on room air,Gen: Well-developed, well-nourished, no apparent distress.HEENT: Pupils equal, round and reactive to light. Anicteric. Oropharynx clear and moist.Neck: Supple. No lymphadenopathy or carotid bruits. No thyromegaly or masses.CHEST: Clear to auscultation bilaterally.CV: Tachycardic but regular rhythm, normal S1/S2, no murmurs/rubs/gallops.Abd: Soft, active bowel sounds. Tender in the epigastrium and right upper quadrant with palpation associated with slight guarding. No rebound tenderness. No hepatomegaly. No splenomegaly.Rectal: Stool was brown and guaiac negative.Ext: No cyanosis/clubbing/edema.Neurological: He was alert and oriented x3. CN II-XII intact. Normal 2+ DTRs. No focal neurological deficit.Skin: No jaundice. No skin rashes or lesions.
+
+IMAGING DATA:CT Abdomen with contrast ( 11/29/03 ):  There is a 6x6 cm multilobular hypodense mass seen at the level of the hepatic hilum and caudate lobe which is resulting in mass effect with dilatation of the intrahepatic radicals of the left lobe of the liver. The rest of the liver parenchyma is homogeneous. The gallbladder, pancreas, spleen, adrenal glands and kidneys are within normal limits. The retroperitoneal vascular structures are within normal limits. There is no evidence of lymphadenopathy, free fluid or fluid collections.HOSPITAL COURSE:  The patient was admitted to the hospital for further evaluation. A diagnostic procedure was performed.
+```
+
+## Usage
+
+The dataset contains one configuration, `mt_note_generation_dialogue_modeling`,
+which has a single text `conversation` feature.
+
+## Source data
+
+The `prepare.py` script transforms data from `mtsamples.csv`, hosted on Kaggle:
+https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions
+
+## Citation
+
+Please cite our work if you find the resources in this repository useful:
+
+```
+@article{pruks2023mtsamplesnotegen,
+  author    = {Yada Pruksachatkun},
+  title     = {MT Samples Note Generation},
+  year      = {2023}
+}
+```
diff --git a/openassistant/datasets/mt_note_generation/__init__.py b/openassistant/datasets/mt_note_generation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/openassistant/datasets/mt_note_generation/hub.py b/openassistant/datasets/mt_note_generation/hub.py
new file mode 100644
index 00000000..296ecee4
--- /dev/null
+++ b/openassistant/datasets/mt_note_generation/hub.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+
+import datasets
+
+
+@dataclass
+class OpenAssistantConfig(datasets.BuilderConfig):
+    """BuilderConfig for OpenAssistant datasets."""
+
+    name: str = None  # configuration name, e.g. "mt_note_generation_dialogue_modeling"
+    version: datasets.Version = None  # dataset version object
+    description: str = None  # free-text summary of the configuration
+    schema: str = None  # output schema; the MT note loader only emits examples for "dialogue_modeling"
+    subset_id: str = None  # identifier of the subset; the MT note loader sets it to its dataset name
+
+
+# Feature schema for this dataset's examples: a single column, "conversation",
+# stored as a plain string. The preparation script emits one
+# {"conversation": ...} JSON object per line, and the loader yields each parsed
+# object unchanged, so the two must keep agreeing on this one key.
+features = datasets.Features({"conversation": datasets.Value("string")})
diff --git a/openassistant/datasets/mt_note_generation/mt_note_generation.py b/openassistant/datasets/mt_note_generation/mt_note_generation.py
new file mode 100644
index 00000000..d78b91b8
--- /dev/null
+++ b/openassistant/datasets/mt_note_generation/mt_note_generation.py
@@ -0,0 +1,123 @@
+# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+MT Note Generation is a set of synthetic dialogues between Assistant and
+User where the user asks the assistant to generate a clinical note for a patient persona.
+"""
+
+import json
+from typing import Dict, List, Tuple
+
+import datasets
+
+from .hub import OpenAssistantConfig, features
+
+_CITATION = """\
+ @misc{transcribed medical transcription sample reports and examples, title={Welcome to MTSamples},
+ url={https://mtsamples.com/},
+ journal={Transcribed Medical Transcription Sample Reports and Examples}}
+"""
+
+_DATASETNAME = "mt_note_generation"  # also prefixes the builder config name and is the key into _URLS
+_DISPLAYNAME = "MT Samples Note Generation"  # human-readable title; not referenced elsewhere in this module
+
+_DESCRIPTION = """\
+A dataset of instructions for generating clinical notes from MT samples.
+"""
+
+_HOMEPAGE = ""  # empty: no homepage URL is set; forwarded unchanged to DatasetInfo
+
+_LICENSE = "mit"
+
+_URLS = {  # split name -> relative JSONL path (prepare.py writes these file names under "data" by default)
+    _DATASETNAME: {
+        "train": "./data/mt_note_generation_train.jsonl",
+        "test": "./data/mt_note_generation_test.jsonl",
+        "validation": "./data/mt_note_generation_validation.jsonl",
+    }
+}
+
+_SUPPORTED_TASKS = ["dialogue-modeling"]  # informational; not referenced elsewhere in this module
+
+_VERSION = "1.0.0"  # wrapped in datasets.Version by the builder class
+
+
+class MTNoteGenerationDataset(datasets.GeneratorBasedBuilder):
+    """Dataset builder for dialogues synthesized from the MT Samples dataset.
+
+    A single configuration is offered. Its examples are read line by line from
+    one JSONL file per split (train, test, validation).
+    """
+
+    VERSION = datasets.Version(_VERSION)
+
+    # Declared ahead of BUILDER_CONFIGS so the lone config can reuse the name.
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_dialogue_modeling"
+
+    BUILDER_CONFIGS = [
+        OpenAssistantConfig(
+            name=DEFAULT_CONFIG_NAME,
+            version=VERSION,
+            description=f"OpenAssistant dataset config for {_DATASETNAME}",
+            schema="dialogue_modeling",
+            subset_id=_DATASETNAME,
+        )
+    ]
+
+    def _info(self) -> datasets.DatasetInfo:
+        """Return the static metadata (description, schema, licence, citation)."""
+        metadata = {
+            "description": _DESCRIPTION,
+            "features": features,
+            "homepage": _HOMEPAGE,
+            "license": _LICENSE,
+            "citation": _CITATION,
+        }
+        return datasets.DatasetInfo(**metadata)
+
+    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
+        """Build one SplitGenerator each for train, test and validation.
+
+        The complete split-to-file mapping is resolved once through
+        ``dl_manager`` and handed to every generator together with the key of
+        the split it is responsible for.
+        """
+        split_files = dl_manager.download_and_extract(_URLS[_DATASETNAME])
+        split_keys = (
+            (datasets.Split.TRAIN, "train"),
+            (datasets.Split.TEST, "test"),
+            (datasets.Split.VALIDATION, "validation"),
+        )
+        # Everything placed in gen_kwargs is passed on to _generate_examples.
+        return [
+            datasets.SplitGenerator(
+                name=split_name,
+                gen_kwargs={"filepath": split_files, "split": split_key},
+            )
+            for split_name, split_key in split_keys
+        ]
+
+    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
+        """Yield ``(key, example)`` tuples for a single split.
+
+        ``filepath`` is indexed by ``split`` to find the JSONL file to read;
+        keys are zero-based line numbers and each example is the parsed line.
+        A config whose schema is not "dialogue_modeling" yields nothing.
+        """
+        if self.config.schema != "dialogue_modeling":
+            return
+        with open(filepath[split], "r", encoding="utf8") as data:
+            for key, line in enumerate(data):
+                yield key, json.loads(line)
diff --git a/openassistant/datasets/mt_note_generation/prepare.py b/openassistant/datasets/mt_note_generation/prepare.py
new file mode 100644
index 00000000..7f0a9146
--- /dev/null
+++ b/openassistant/datasets/mt_note_generation/prepare.py
@@ -0,0 +1,84 @@
+import json
+import math
+import os
+import random
+import re
+import sys
+from string import punctuation
+
+import kaggle
+import pandas as pd
+
+CLINICAL_NOTE_GENERATION_TEMPLATE = """User: Write a clinical note about a patient with the following {section}: {section_information}.
+Rosey: {note}"""  # filled via .format(section=..., section_information=..., note=...); the "." after the prompt is literal
+
+
+def preprocess(mt_dataset):
+    """Return the rows whose transcription contains a "chief complaint:" header.
+
+    Rows with a missing description or transcription are removed first, and the
+    header is matched without regard to letter case.
+    """
+    complete_rows = mt_dataset.dropna(subset=["description", "transcription"])
+    # Row-wise check producing the boolean mask used to select the note rows.
+    is_note = complete_rows.apply(lambda row: "chief complaint:" in row["transcription"].lower(), axis=1)
+    return complete_rows[is_note]
+
+
+def is_chief_complaint(section):  # True when the lower-cased section name mentions "chief complaint"
+    return section.lower().find("chief complaint") != -1
+
+
+def get_conversations(dataset):
+    """Build one "User: ... / Rosey: ..." exchange per usable transcription and return them as a list.
+
+    The text between the first two upper-case "HEADER:" markers is the user's prompt and the whole normalized
+    transcript is the reply; transcripts with fewer than two markers are skipped.
+    """
+    cleanup_rules = [
+        (r"\.+", "."), (r"\,+", ","), (r":\s+", ": "), (r"\.\s+", ". "), (r":(\s)*\,+", ": "), (r"\.\,+", ". ")
+    ]
+    conversations = []
+    for transcript in dataset["transcription"]:
+        for pattern, replacement in cleanup_rules:  # applied in this order
+            transcript = re.sub(pattern, replacement, transcript)
+        sections = re.findall(r"\b[A-Z]+(?: [A-Z]+)*:", transcript)
+        if len(sections) < 2:
+            continue
+        raw_prompt = transcript.split(sections[0])[1].split(sections[1])[0]
+        section_name = sections[0].lower().strip(punctuation)
+        if len(raw_prompt.split(" ")) > 30 and is_chief_complaint(section_name):
+            section_name = "history of present illness"  # long "chief complaints" seem to be HPI
+        # The template adds the closing "." itself: trim padding and any final period to avoid ". ."
+        note_prompt = raw_prompt.strip().rstrip(".").rstrip()
+        conversations.append(
+            CLINICAL_NOTE_GENERATION_TEMPLATE.format(
+                section=section_name, section_information=note_prompt, note=transcript
+            )
+        )
+    return conversations
+
+
+def main(output_dir: str = "data"):
+    """Download MT Samples from Kaggle and write shuffled 60/20/20 JSONL splits into ``output_dir``."""
+    os.makedirs(output_dir, exist_ok=True)
+    # The archive is unzipped into output_dir, so the CSV has to be read from there
+    # rather than from the current working directory.
+    kaggle.api.dataset_download_files("tboyle10/medicaltranscriptions", output_dir, unzip=True)
+    mt_samples = preprocess(pd.read_csv(os.path.join(output_dir, "mtsamples.csv")))
+    conversations = get_conversations(mt_samples)
+    random.shuffle(conversations)
+    train_limit = math.ceil(len(conversations) * 0.6)
+    dev_limit = math.ceil(len(conversations) * 0.8)
+    splits = {
+        "train": conversations[:train_limit],
+        "validation": conversations[train_limit:dev_limit],
+        "test": conversations[dev_limit:],
+    }
+    for split, rows in splits.items():
+        with open(os.path.join(output_dir, f"mt_note_generation_{split}.jsonl"), "w", encoding="utf8") as f:
+            f.writelines(f"{json.dumps({'conversation': row})}\n" for row in rows)
+
+
+if __name__ == "__main__":  # run the preparation only when executed as a script
+    sys.exit(main())  # main() has no return value, so a normal run exits with status 0
diff --git a/train_toxicity_model.py b/train_toxicity_model.py
new file mode 100644
index 00000000..e69de29b