From 994dce7c766db0723488d98224a71ef39665be3d Mon Sep 17 00:00:00 2001
From: MattAlexMiracle <Alexander.Mattick@googlemail.com>
Date: Sat, 28 Jan 2023 00:56:56 +0100
Subject: [PATCH] language classifier (#963)

Co-authored-by: Alexander Mattick <alex.mattick@fau.de>
---
 .../utils/language_classification.py          | 111 ++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 backend/oasst_backend/utils/language_classification.py

diff --git a/backend/oasst_backend/utils/language_classification.py b/backend/oasst_backend/utils/language_classification.py
new file mode 100644
index 00000000..ec75b9c3
--- /dev/null
+++ b/backend/oasst_backend/utils/language_classification.py
@@ -0,0 +1,111 @@
+import os
+import pickle
+from collections import Counter
+
+from sklearn import metrics
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.svm import LinearSVC
+
+
+def load_and_split(foldername, num_words):
+    """Chunk every language file in `foldername` into `num_words`-word samples and split them into train/test sets.
+
+    Uses at most 100_000 words per file. Returns `x_train, x_test, y_train, y_test, langmap` (label id -> file name).
+    """
+    X, Y, langmap = [], [], dict(enumerate(sorted(os.listdir(foldername))))  # sorted: label ids stay stable
+    for idx, x in langmap.items():
+        print("loading language", x)
+        with open(os.path.join(foldername, x), "r", encoding="utf-8") as reader:
+            tmp = reader.read().split(" ")
+        # Stop at the real word count: slicing past the end would add empty-string samples for short files.
+        tmp = [" ".join(tmp[i : i + num_words]) for i in range(0, min(len(tmp), 100_000), num_words)]
+        X.extend(tmp)
+        Y.extend([idx] * len(tmp))
+    return (*train_test_split(X, Y, test_size=0.90), langmap)  # test_size=0.90 leaves 10% for training
+
+
+def build_and_train_pipeline(x_train, y_train):
+    """Fit and return a text classifier: character 1-2-gram term frequencies fed into a linear SVM.
+
+    `x_train` is passed to the pipeline's `fit` as the samples and `y_train` as their labels.
+    """
+    # use_idf=False: plain term frequencies, no IDF re-weighting.
+    char_ngrams = TfidfVectorizer(analyzer="char", ngram_range=(1, 2), use_idf=False)
+    svm = LinearSVC(C=0.5)
+    # Commented-out alternatives kept from the original: a Nystroem(n_components=1000, n_jobs=6) step before
+    # the classifier, and GaussianNB() or HistGradientBoostingClassifier() as the "clf" step.
+    pipeline = Pipeline([("vec", char_ngrams), ("clf", svm)])
+    print("fitting model...")
+    pipeline.fit(x_train, y_train)
+    return pipeline
+
+
+def benchmark(clf, x_test, y_test, langmap):
+    """Print a classification report and the confusion matrix for `clf` evaluated on `x_test` / `y_test`.
+
+    The values of `langmap`, in iteration order, are used as the display names of the classes.
+    """
+    print("benchmarking model...")
+    predicted = clf.predict(x_test)
+    print(metrics.classification_report(y_test, predicted, target_names=list(langmap.values())))
+    print(metrics.confusion_matrix(y_test, predicted))
+
+
+def main(foldername, modelname, num_words):
+    """Train, benchmark and pickle a language classifier, then run the saved model on an English sample text."""
+    sample = """
+    What language is this text written in? Nobody knows until you fill in at least ten words.
+    This test here is to check whether the moving window approach works,
+    so I still need to fill in a little more text.
+    """
+    # Chunk the corpus and separate the training samples from the test samples.
+    splits = load_and_split(foldername=foldername, num_words=num_words)
+    x_train, x_test, y_train, y_test, langmap = splits
+    clf = build_and_train_pipeline(x_train, y_train)
+    benchmark(clf, x_test, y_test, langmap)
+    save_model(clf, langmap, num_words, modelname)
+    # Reload from disk so the check below exercises the pickled file rather than the in-memory objects.
+    model = load(modelname)
+    prediction = inference_voter(model, sample)
+    print("running infernence on long tests", prediction)
+
+
+def load(modelname):
+    """Unpickle and return the object stored at `modelname` (pickle can execute code, so load trusted files only)."""
+    with open(modelname, "rb") as reader:
+        return pickle.load(reader)
+
+
+def save_model(model, idx_to_name, num_words, modelname):
+    """Pickle the classifier and its metadata to the file `modelname`, overwriting any existing file.
+
+    The stored object is a dict with the keys "model", "idx_to_name" and "num_words".
+    """
+    payload = dict(model=model, idx_to_name=idx_to_name, num_words=num_words)
+    with open(modelname, "wb") as sink:
+        pickle.dump(payload, sink)
+
+
+def inference_voter(model, text):
+    """Return the language name that wins a majority vote over sliding `num_words`-word windows of `text`.
+
+    `model` is the dict produced by `save_model` / `load`. Texts shorter than one window are classified as a whole.
+    """
+    words, size = text.split(), model["num_words"]
+    # `+ 1` keeps the final full window; `max(1, ...)` guarantees at least one window for short texts.
+    windows = [" ".join(words[i : i + size]) for i in range(max(1, len(words) - size + 1))]
+    return model["idx_to_name"][Counter(model["model"].predict(windows)).most_common(1)[0][0]]
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # All three options are required: without them main() would receive None for the folder, path or window size.
+    parser = argparse.ArgumentParser(description="Train, benchmark and save a language classifier.")
+    parser.add_argument("-m", "--model", required=True, help="save location for model and metadata")
+    parser.add_argument("-d", "--data", required=True, help="specify the folder for data files")
+    parser.add_argument("-n", "--num_words", required=True, type=int, help="number of words to use for statistics")
+    args = parser.parse_args()
+    main(args.data, args.model, args.num_words)