browsermt · mfomicheva · Mar 2, 2022 · Feb 18, 2022 · Feb 28, 2022
diff --git a/README.md b/README.md
@@ -10,3 +10,15 @@ Once conda env is created, activate it by running `conda activate qe`. After tha
 to install the package.
 
 Once the installation is completed, you can run `scripts/run.sh` to produce the models.
+
+## Models
+
+The quality models available are:
+
+- [English-Czech](model/csen/encs.quality.lr/)
+- [English-Estonian](model/eten/enet.quality.lr/)
+- [English-Spanish](model/esen/enes.quality.lr/)
+
+## Tools
+
+- [Convert json quality model file to a binary file and vice versa](tools/lr/)
diff --git a/model/csen/encs.quality.lr/README.md b/model/csen/encs.quality.lr/README.md
@@ -0,0 +1,5 @@
+# English-Czech Quality Model
+
+## Generating model file
+
+- To generate this binary file (`quality_model.bin`), we use the following [script tool](../../../tools/lr).
diff --git a/model/csen/encs.quality.lr/model.encs.json b/model/csen/encs.quality.lr/model.encs.json
@@ -0,0 +1,21 @@
+{
+    "mean_": [
+        -0.24797849288781534,
+        -0.4101926409017722,
+        2.2572463768115942,
+        -0.17062557101632025
+    ],
+    "scale_": [
+        0.30622666546430516,
+        0.46463165469978934,
+        1.2531152108122379,
+        0.07587893685108503
+    ],
+    "coef_": [
+        -0.08352394676472537,
+        -0.7595593200892585,
+        0.5742433796116994,
+        -0.41813250641346145
+    ],
+    "intercept_": 0.07783525719972424
+}
diff --git a/model/csen/encs.quality.lr/quality_model.bin b/model/csen/encs.quality.lr/quality_model.bin
diff --git a/model/esen/enes.quality.lr/README.md b/model/esen/enes.quality.lr/README.md
@@ -0,0 +1,5 @@
+# English-Spanish Quality Model
+
+## Generating model file
+
+- To generate this binary file (`quality_model.bin`), we use the following [script tool](../../../tools/lr).
diff --git a/model/esen/enes.quality.lr/model.enes.json b/model/esen/enes.quality.lr/model.enes.json
@@ -0,0 +1 @@
+{"mean_": [-0.1683024833460827, -0.2219354016064257, 1.5856760374832664, -0.09506267842890267], "scale_": [0.26767448702069596, 0.3384959372122154, 0.9846660586834864, 0.0596166611041935], "coef_": [-0.0915905546735799, -0.9650785415892639, 0.47999183999258094, -0.4956439542954627], "intercept_": 0.16704303096891654}
diff --git a/model/esen/enes.quality.lr/quality_model.bin b/model/esen/enes.quality.lr/quality_model.bin
diff --git a/model/eten/enet.quality.lr/README.md b/model/eten/enet.quality.lr/README.md
@@ -0,0 +1,5 @@
+# English-Estonian Quality Model
+
+## Generating model file
+
+- To generate this binary file (`quality_model.bin`), we use the following [script tool](../../../tools/lr).
diff --git a/model/eten/enet.quality.lr/model.enet.json b/model/eten/enet.quality.lr/model.enet.json
@@ -0,0 +1,21 @@
+{
+    "mean_": [
+        -0.23515637219436597,
+        -0.41058617943548387,
+        2.546706989247312,
+        -0.18946239170117798
+    ],
+    "scale_": [
+        0.29801481765167837,
+        0.4796714344066216,
+        1.5305477848377145,
+        0.103550684449697
+    ],
+    "coef_": [
+        0.12835516184041088,
+        -1.1683037932196483,
+        0.648637158173214,
+        -0.4848115618235599
+    ],
+    "intercept_": 0.15527772370082504
+}
diff --git a/model/eten/enet.quality.lr/quality_model.bin b/model/eten/enet.quality.lr/quality_model.bin
diff --git a/tools/lr/README.md b/tools/lr/README.md
@@ -0,0 +1,27 @@
+# Quality Model Tool
+
+- The python script ```qualityestimator_json_to_bin.py``` converts a logistic regressor quality estimator model from json to binary file and vice versa.
+
+- To convert a json file to binary:
+
+```console
+  python qualityestimator_json_to_bin.py --from_json qe_model.json --out qe_model.bin
+```
+
+- To convert a binary file to json:
+
+```console
+  python qualityestimator_json_to_bin.py --to_json qe_model.bin --out qe_model.json
+```
+
+- The json must follow this structure:
+```json
+{
+    "mean_": [ 0.0, 0.0, 0.0, 0.0 ],
+    "scale_": [ 0.0, 0.0, 0.0, 0.0 ],
+    "coef_": [ 0.0, 0.0, 0.0, 0.0 ],
+    "intercept_": 0.0
+}
+```
+
+- The binary file follows the structure defined in [LogisticRegressorQualityEstimator](https://github.com/browsermt/bergamot-translator/blob/main/src/translator/quality_estimator.h#L100-L108).
diff --git a/tools/lr/qualityestimator_json_to_bin.py b/tools/lr/qualityestimator_json_to_bin.py
@@ -0,0 +1,91 @@
+import argparse
+import json
+import struct
+from collections import namedtuple
+
+# Binary header, little-endian ("<"): magic(uint64_t), lrParametersDims(uint64_t)
+Header_fmt = "<1Q1Q"
+Header_len = struct.calcsize(Header_fmt)
+# Equals 0x78CC336F1D54B180; written first by to_binary, checked by from_qe_file.
+QE_MAGIC_NUMBER = 8704388732126802304
+
+
+def from_qe_file(file):
+    """Parse a binary quality estimator file into a dict of LR parameters.
+
+    Reads the header from the binary file object ``file``; prints an error and
+    exits with status 1 if the magic number does not match. Otherwise reads
+    3 * N + 1 floats and splits them into scale_, mean_, coef_ and intercept_.
+    """
+    magic, dim = struct.unpack(Header_fmt, file.read(Header_len))
+    if magic != QE_MAGIC_NUMBER:
+        print("Invalid quality estimator file.")
+        exit(1)
+    # Payload layout: scale_[N] + mean_[N] + coef_[N] + intercept_
+    payload_fmt = f"<{3*dim+1}f"
+    payload = file.read(struct.calcsize(payload_fmt))
+    values = list(struct.unpack(payload_fmt, payload))
+    return {
+        "scale_": values[:dim],
+        "mean_": values[dim : 2 * dim],
+        "coef_": values[2 * dim : 3 * dim],
+        "intercept_": values[3 * dim],
+    }
+
+
+def to_binary(lrParams):
+    """Serialize LR parameters into the binary quality estimator format.
+
+    ``lrParams`` maps ``scale_``, ``mean_`` and ``coef_`` to equal-length lists
+    and ``intercept_`` to a number. Returns the header followed by the
+    parameters packed as 32-bit floats; prints an error and exits with
+    status 1 when the three lists differ in length.
+    """
+    paramDims = len(lrParams["scale_"])
+
+    # All three vectors must share one length, so a single mismatch is enough
+    # to reject the input (hence `or`, not `and`).
+    if paramDims != len(lrParams["mean_"]) or paramDims != len(lrParams["coef_"]):
+        print("Invalid LR parameters.")
+        exit(1)
+
+    lrParams_fmt = f"<{3*paramDims+1}f"
+    # Same order from_qe_file reads back: scale_, mean_, coef_, intercept_.
+    params = lrParams["scale_"] + lrParams["mean_"] + lrParams["coef_"]
+    params.append(lrParams["intercept_"])
+    header = struct.pack(Header_fmt, QE_MAGIC_NUMBER, paramDims)
+    return header + struct.pack(lrParams_fmt, *params)
+
+
+parser = argparse.ArgumentParser(description="Read and write quality estimator files.")
+parser.add_argument(
+    "--to_json", type=argparse.FileType("rb"), help="Read quality estimator file"
+)
+parser.add_argument(
+    "--from_json",
+    type=argparse.FileType("r"),
+    help="Read json file and generate quality estimator binary",
+)
+parser.add_argument(
+    "--out",
+    type=argparse.FileType("wb"),
+    help="Output generated data from to_json or from_json option",
+)
+
+# Exactly one conversion runs; --to_json wins when both inputs are given.
+args = parser.parse_args()
+if args.to_json:
+    output = json.dumps(from_qe_file(args.to_json), indent=3)
+elif args.from_json:
+    output = to_binary(json.loads(args.from_json.read()))
+else:
+    exit(0)  # neither input flag was given: nothing to convert
+
+if args.out:
+    args.out.write(output.encode("UTF-8") if isinstance(output, str) else output)
+    args.out.close()
+elif isinstance(output, str):
+    print(output)
+else:
+    import sys  # print() would emit the bytes repr (b'...'), not the raw model
+    sys.stdout.buffer.write(output)