From 994dce7c766db0723488d98224a71ef39665be3d Mon Sep 17 00:00:00 2001 From: MattAlexMiracle Date: Sat, 28 Jan 2023 00:56:56 +0100 Subject: [PATCH] language classifier (#963) Co-authored-by: Alexander Mattick --- .../utils/language_classification.py | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 backend/oasst_backend/utils/language_classification.py diff --git a/backend/oasst_backend/utils/language_classification.py b/backend/oasst_backend/utils/language_classification.py new file mode 100644 index 00000000..ec75b9c3 --- /dev/null +++ b/backend/oasst_backend/utils/language_classification.py @@ -0,0 +1,111 @@ +import os +import pickle +from collections import Counter + +from sklearn import metrics +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.svm import LinearSVC + + +def load_and_split(foldername, num_words): + ls = os.listdir(foldername) + X = [] + Y = [] + langmap = dict() + for idx, x in enumerate(ls): + print("loading language", x) + with open(foldername + "/" + x, "r") as reader: + tmp = reader.read().split(" ") + tmp = [" ".join(tmp[i : i + num_words]) for i in range(0, 100_000, num_words)] + X.extend(tmp) + Y.extend([idx] * len(tmp)) + langmap[idx] = x + x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.90) + return x_train, x_test, y_train, y_test, langmap + + +def build_and_train_pipeline(x_train, y_train): + vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer="char", use_idf=False) + clf = Pipeline( + [ + ("vec", vectorizer), + # ("nystrom", Nystroem(n_components=1000,n_jobs=6)), + ("clf", LinearSVC(C=0.5)), + # ("clf",GaussianNB()) + # ("clf", HistGradientBoostingClassifier()) + ] + ) + print("fitting model...") + clf.fit(x_train, y_train) + return clf + + +def benchmark(clf, x_test, y_test, langmap): + print("benchmarking model...") + y_pred = clf.predict(x_test) + names = list(langmap.values()) + # print(y_test) + # print(langmap) + print(metrics.classification_report(y_test, y_pred, target_names=names)) + cm = metrics.confusion_matrix(y_test, y_pred) + print(cm) + + +def main(foldername, modelname, num_words): + x_train, x_test, y_train, y_test, langmap = load_and_split(foldername=foldername, num_words=num_words) + clf = build_and_train_pipeline(x_train, y_train) + benchmark(clf, x_test, y_test, langmap) + save_model(clf, langmap, num_words, modelname) + model = load(modelname) + print( + "running infernence on long tests", + inference_voter( + model, + """ + What language is this text written in? Nobody knows until you fill in at least ten words. + This test here is to check whether the moving window approach works, + so I still need to fill in a little more text. + """, + ), + ) + + +def load(modelname): + with open(modelname, "rb") as writer: + data = pickle.load(writer) + return data + + +def save_model(model, idx_to_name, num_words, modelname): + out = { + "model": model, + "idx_to_name": idx_to_name, + "num_words": num_words, + } + with open(modelname, "wb") as writer: + pickle.dump(out, writer) + + +def inference_voter(model, text): + tmp = text.split() + # print(len(tmp), tmp) + tmp = [" ".join(tmp[i : i + model["num_words"]]) for i in range(0, len(tmp) - model["num_words"])] + predictions = model["model"].predict(tmp) + # print("integer predictions", predictions) + # print("name predictions", *[model["idx_to_name"][n] for n in predictions]) + result = Counter(predictions).most_common(1)[0][0] + return model["idx_to_name"][result] + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("-m", "--model", help="save location for model and metadata") + parser.add_argument("-d", "--data", help="specify the folder for data files") + parser.add_argument("-n", "--num_words", help="number of words to use for statistics", type=int) + args = parser.parse_args() + # np.set_printoptions(threshold=np.inf) + main(args.data, args.model, args.num_words)