Introducing ROUGE: A full Python Implementation of the ROUGE Metric
We provide a useful and fast module for ROUGE scoring as well as a
command to use it directly from the shell.
pltrdy committed Mar 16, 2017
1 parent 55f2d66 commit 93d0f18
Showing 15 changed files with 842 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# swap files
*.swp

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
123 changes: 123 additions & 0 deletions README.md
@@ -0,0 +1,123 @@
# Rouge
*A full Python library for the ROUGE metric [(paper)](http://www.aclweb.org/anthology/W04-1013).*

## Quickstart
#### Clone & Install
```shell
git clone https://github.com/pltrdy/rouge
cd rouge
sudo python3 setup.py install
```
or from pip:
```
sudo pip3 install rouge
```
#### Use it from the shell (JSON Output)
```
$ rouge -h
usage: rouge [-h] [-f] [-a] hypothesis reference

Rouge Metric Calculator

positional arguments:
  hypothesis  Text or file path
  reference   Text or file path

optional arguments:
  -h, --help  show this help message and exit
  -f, --file  File mode
  -a, --avg   Average mode
```

e.g.

```shell
# Single Sentence
rouge "transcript is a written version of each day 's cnn student" \
""this page includes the show transcript use the transcript to help students with"
# Scoring using two files (line by line)
rouge -f ./tests/hyp.txt ./ref.txt
```
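
The two flags combine; for instance, to print the average over the whole file pair rather than one score per line (a sketch reusing the paths above):
```shell
# Average ROUGE over every line pair of the two files
rouge -f -a ./tests/hyp.txt ./ref.txt
```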
#### As a library
###### Score 1 sentence
```python
from rouge import Rouge
hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to he lp students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you saw on cnn student news"

reference = "this page includes the show transcript use the transcript to help students with reading comprehension and vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teac her or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests students ' knowledge of even ts in the news"

rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
```

*Output:*

```json
{
"rouge-1": {
"f": 0.5238095189484127,
"p": 0.6285714285714286,
"r": 0.4489795918367347
},
"rouge-2": {
"f": 0.27027026566025497,
"p": 0.375,
"r": 0.2112676056338028
},
"rouge-l": {
"f": 0.28711800978275975,
"p": 0.4418604651162791,
"r": 0.25675675675675674
}
}
```

###### Score multiple sentences
```python
import json
from rouge import Rouge

# Load some sentences
with open('./tests/data.json') as f:
data = json.load(f)

hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))
rouge = Rouge()
scores = rouge.get_scores(hyps, refs)
# or
scores = rouge.get_scores(hyps, refs, avg=True)
```

*Output (`avg=False`)*: a list of `n` dicts:

```
{"rouge-1": {"f": _, "p": _, "r": _}, "rouge-2" : { .. }, "rouge-3": { ... }}
```


*Output (`avg=True`)*: a single dict with average values:

```
{"rouge-1": {"f": _, "p": _, "r": _}, "rouge-2" : { ..     }, "rouge-3": { ... }}
```

###### Score two files (line by line)
Given two files `hyp_path` and `ref_path` with the same number (`n`) of lines, calculate the score for each pair of lines, or the average over the whole files.

```python
from rouge import FilesRouge

files_rouge = FilesRouge(hyp_path, ref_path)
scores = files_rouge.get_scores()
# or
scores = files_rouge.get_scores(avg=True)
```

**Note** that you can avoid consuming too much memory by passing `batch_lines=l`; the script then reads only `l` lines at a time (otherwise it loads both files entirely).
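
For instance, a minimal sketch (paths reuse the test files above; the batch size of 128 is arbitrary):
```python
from rouge import FilesRouge

# Read 128 lines per batch instead of loading both files at once
files_rouge = FilesRouge('./tests/hyp.txt', './ref.txt', batch_lines=128)
scores = files_rouge.get_scores(avg=True)
```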
Binary file added bin/.rouge_cmd.py.swp
Empty file added bin/__init__.py
36 changes: 36 additions & 0 deletions bin/rouge_cmd.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python3
import argparse
import json
import os

from rouge import Rouge, FilesRouge


def main():
    parser = argparse.ArgumentParser(description='Rouge Metric Calculator')
    parser.add_argument('-f', '--file', help="File mode", action='store_true')
    parser.add_argument('-a', '--avg', help="Average mode", action='store_true')
    parser.add_argument('hypothesis', type=str, help='Text or file path')
    parser.add_argument('reference', type=str, help='Text or file path')

    args = parser.parse_args()
    hyp, ref = args.hypothesis, args.reference

    if args.file:
        # File mode: score two files line by line
        assert os.path.isfile(hyp)
        assert os.path.isfile(ref)

        files_rouge = FilesRouge(hyp, ref)
        scores = files_rouge.get_scores(avg=args.avg)
    else:
        # Text mode: score two sentences passed directly as arguments
        rouge = Rouge()
        scores = rouge.get_scores(hyp, ref, avg=args.avg)

    print(json.dumps(scores, indent=2))


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions rouge/__init__.py
@@ -0,0 +1,3 @@
from rouge.rouge import FilesRouge, Rouge

__version__ = "0.2"
135 changes: 135 additions & 0 deletions rouge/rouge.py
@@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
import rouge.rouge_score as rouge_score
import os
import numpy as np

class FilesRouge:
    def __init__(self, hyp_path, ref_path, metrics=None, stats=None,
                 batch_lines=None):
        assert os.path.isfile(hyp_path)
        assert os.path.isfile(ref_path)

        self.rouge = Rouge(metrics=metrics, stats=stats)

        def line_count(path):
            # Count lines without loading the whole file into memory
            count = 0
            for line in open(path):
                count += 1
            return count

        hyp_lc = line_count(hyp_path)
        ref_lc = line_count(ref_path)
        assert hyp_lc == ref_lc

        assert batch_lines is None or isinstance(batch_lines, int)

        self.hyp_path = hyp_path
        self.ref_path = ref_path
        self.line_count = hyp_lc
        self.batch_lines = batch_lines

    def get_scores(self, avg=False):
        """Calculate ROUGE scores between each pair of lines
        (hyp_file[i], ref_file[i]).

        Args:
          * avg (False): whether to return an average score or a
            list of per-line scores

        If `batch_lines` was set, only that many lines are read at
        a time (uses less memory); otherwise both files are loaded
        entirely.
        """
        batch_lines = self.batch_lines
        hyp_path, ref_path = self.hyp_path, self.ref_path
        line_count = self.line_count

        if batch_lines is None:
            hyps = [line[:-1] for line in open(hyp_path).readlines()]
            refs = [line[:-1] for line in open(ref_path).readlines()]
            return self.rouge.get_scores(hyps, refs, avg=avg)

        if batch_lines > line_count:
            batch_lines = line_count

        if avg:
            # Accumulate per-metric/per-stat sums, weighted by batch size
            sc = {m: {s: 0.0 for s in self.rouge.stats}
                  for m in self.rouge.metrics}

            def update_scores(sc, h, r):
                batch_avg = self.rouge.get_scores(h, r, avg=True)
                for m in sc:
                    for s in sc[m]:
                        sc[m][s] += batch_avg[m][s] * len(h)
                return sc
        else:
            sc = []

            def update_scores(sc, h, r):
                return sc + self.rouge.get_scores(h, r)

        batch_hyp = []
        batch_ref = []

        with open(hyp_path) as hyp_file, open(ref_path) as ref_file:
            for hyp_line, ref_line in zip(hyp_file, ref_file):
                batch_hyp.append(hyp_line[:-1])
                batch_ref.append(ref_line[:-1])

                if len(batch_hyp) == batch_lines:
                    sc = update_scores(sc, batch_hyp, batch_ref)
                    batch_hyp = []
                    batch_ref = []

            # Flush the last, possibly partial, batch
            if batch_hyp:
                sc = update_scores(sc, batch_hyp, batch_ref)

        if avg:
            # Turn weighted sums into means over all lines
            return {m: {s: sc[m][s] / line_count for s in sc[m]}
                    for m in sc}
        return sc


class Rouge:
    DEFAULT_METRICS = ["rouge-1", "rouge-2", "rouge-l"]
    AVAILABLE_METRICS = {
        "rouge-1": lambda hyp, ref: rouge_score.rouge_n([hyp], [ref], 1),
        "rouge-2": lambda hyp, ref: rouge_score.rouge_n([hyp], [ref], 2),
        "rouge-l": lambda hyp, ref: rouge_score.rouge_l_sentence_level([hyp], [ref]),
    }

    DEFAULT_STATS = ["f", "p", "r"]
    AVAILABLE_STATS = {"f": 0, "p": 1, "r": 2}

    def __init__(self, metrics=None, stats=None):
        self.metrics = metrics if metrics is not None else Rouge.DEFAULT_METRICS
        self.stats = stats if stats is not None else Rouge.DEFAULT_STATS

        for m in self.metrics:
            if m not in Rouge.AVAILABLE_METRICS:
                raise ValueError("Unknown metric '%s'" % m)

        for s in self.stats:
            if s not in Rouge.AVAILABLE_STATS:
                raise ValueError("Unknown stat '%s'" % s)

    def get_scores(self, hyps, refs, avg=False):
        if isinstance(hyps, str):
            hyps, refs = [hyps], [refs]

        assert type(hyps) == type(refs)
        assert len(hyps) == len(refs)

        if not avg:
            return self._get_scores(hyps, refs)
        return self._get_avg_scores(hyps, refs)

    def _get_scores(self, hyps, refs):
        scores = []
        for hyp, ref in zip(hyps, refs):
            sen_score = {}
            for m in self.metrics:
                fn = Rouge.AVAILABLE_METRICS[m]
                sc = fn(hyp, ref)
                sen_score[m] = {s: sc[Rouge.AVAILABLE_STATS[s]]
                                for s in self.stats}
            scores.append(sen_score)
        return scores

    def _get_avg_scores(self, hyps, refs):
        scores = {}
        for m in self.metrics:
            fn = Rouge.AVAILABLE_METRICS[m]
            # Per-sentence (f, p, r) tuples for this metric
            sc = [fn(hyp, ref) for hyp, ref in zip(hyps, refs)]
            sc = [[sen_sc[Rouge.AVAILABLE_STATS[s]] for s in self.stats]
                  for sen_sc in sc]
            # Average each requested stat over all sentences
            scores[m] = {s: st for s, st
                         in zip(self.stats, map(np.mean, zip(*sc)))}
        return scores
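
Since `__init__` validates `metrics` and `stats` against `AVAILABLE_METRICS` and `AVAILABLE_STATS`, the scorer can be restricted to a subset of both; a minimal usage sketch (the example sentences are arbitrary):
```python
from rouge import Rouge

# Compute only ROUGE-L, and report only the F-measure
rouge = Rouge(metrics=["rouge-l"], stats=["f"])
scores = rouge.get_scores("the cat sat on the mat",
                          "the cat is on the mat")
# -> [{'rouge-l': {'f': ...}}]
```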

