From a6135336f5e2ec66fe95b2dc7a9a54cfb29167ac Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 27 Oct 2017 17:02:55 +0200
Subject: [PATCH] Tidy up gold

---
 spacy/gold.pyx | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 36 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 5729af667..921c837ba 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -54,7 +54,8 @@ def merge_sents(sents):
         m_deps[3].extend(head + i for head in heads)
         m_deps[4].extend(labels)
         m_deps[5].extend(ner)
-        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
+        m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
+                          for b in brackets)
         i += len(ids)
     return [(m_deps, m_brackets)]
 
@@ -80,6 +81,8 @@ def align(cand_words, gold_words):
 
 
 punct_re = re.compile(r'\W')
+
+
 def _min_edit_path(cand_words, gold_words):
     cdef:
         Pool mem
@@ -98,9 +101,9 @@ def _min_edit_path(cand_words, gold_words):
     mem = Pool()
     n_cand = len(cand_words)
     n_gold = len(gold_words)
-    # Levenshtein distance, except we need the history, and we may want different
-    # costs.
-    # Mark operations with a string, and score the history using _edit_cost.
+    # Levenshtein distance, except we need the history, and we may want
+    # different costs. Mark operations with a string, and score the history
+    # using _edit_cost.
     previous_row = []
     prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
     curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
@@ -144,9 +147,9 @@ def _min_edit_path(cand_words, gold_words):
 
 
 def minibatch(items, size=8):
-    '''Iterate over batches of items. `size` may be an iterator,
+    """Iterate over batches of items. `size` may be an iterator,
     so that batch-size can vary on each step.
-    '''
+    """
     if isinstance(size, int):
         size_ = itertools.repeat(8)
     else:
@@ -168,6 +171,7 @@ class GoldCorpus(object):
 
         train_path (unicode or Path): File or directory of training data.
        dev_path (unicode or Path): File or directory of development data.
+        RETURNS (GoldCorpus): The newly created object.
         """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
@@ -213,7 +217,7 @@ class GoldCorpus(object):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
-                    self.train_tuples, label_freq_cutoff=100)
+                self.train_tuples, label_freq_cutoff=100)
         random.shuffle(train_tuples)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                         max_length=max_length,
@@ -222,7 +226,6 @@ class GoldCorpus(object):
 
     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        #gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs
 
     @classmethod
@@ -233,7 +236,6 @@ class GoldCorpus(object):
                 raw_text = None
             else:
                 paragraph_tuples = merge_sents(paragraph_tuples)
-
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                   gold_preproc, noise_level=noise_level)
             golds = cls._make_golds(docs, paragraph_tuples)
@@ -248,17 +250,20 @@ class GoldCorpus(object):
             raw_text = add_noise(raw_text, noise_level)
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
-                    for (sent_tuples, brackets) in paragraph_tuples]
+            return [Doc(nlp.vocab,
+                        words=add_noise(sent_tuples[1], noise_level))
+                    for (sent_tuples, brackets) in paragraph_tuples]
 
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
         assert len(docs) == len(paragraph_tuples)
         if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
+            return [GoldParse.from_annot_tuples(docs[0],
+                                                paragraph_tuples[0][0])]
         else:
             return [GoldParse.from_annot_tuples(doc, sent_tuples)
-                    for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
+                    for doc, (sent_tuples, brackets)
+                    in zip(docs, paragraph_tuples)]
 
     @staticmethod
     def walk_corpus(path):
@@ -330,16 +335,16 @@ def read_json_file(loc, docs_filter=None, limit=None):
                 for i, token in enumerate(sent['tokens']):
                     words.append(token['orth'])
                     ids.append(i)
-                    tags.append(token.get('tag','-'))
-                    heads.append(token.get('head',0) + i)
-                    labels.append(token.get('dep',''))
+                    tags.append(token.get('tag', '-'))
+                    heads.append(token.get('head', 0) + i)
+                    labels.append(token.get('dep', ''))
                     # Ensure ROOT label is case-insensitive
                     if labels[-1].lower() == 'root':
                         labels[-1] = 'ROOT'
                     ner.append(token.get('ner', '-'))
                 sents.append([
                     [ids, words, tags, heads, labels, ner],
-                        sent.get('brackets', [])])
+                    sent.get('brackets', [])])
             if sents:
                 yield [paragraph.get('raw', None), sents]
 
@@ -382,19 +387,21 @@ cdef class GoldParse:
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples
-        return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
-                   make_projective=make_projective)
+        return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
+                   entities=entities, make_projective=make_projective)
 
-    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
-                 deps=None, entities=None, make_projective=False,
+    def __init__(self, doc, annot_tuples=None, words=None, tags=None,
+                 heads=None, deps=None, entities=None, make_projective=False,
                  cats=None):
         """Create a GoldParse.
 
         doc (Doc): The document the annotations refer to.
         words (iterable): A sequence of unicode word strings.
         tags (iterable): A sequence of strings, representing tag annotations.
-        heads (iterable): A sequence of integers, representing syntactic head offsets.
-        deps (iterable): A sequence of strings, representing the syntactic relation types.
+        heads (iterable): A sequence of integers, representing syntactic
+            head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic
+            relation types.
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
@@ -404,9 +411,10 @@ cdef class GoldParse:
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
-           true examples of a label to have the value 1.0, and negative examples
-           of a label to have the value 0.0. Labels not in the dictionary are
-           treated as missing -- the gradient for those labels will be zero.
+           true examples of a label to have the value 1.0, and negative
+           examples of a label to have the value 0.0. Labels not in the
+           dictionary are treated as missing - the gradient for those labels
+           will be zero.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:
@@ -470,11 +478,11 @@ cdef class GoldParse:
                     self.ner[i] = entities[gold_i]
 
         cycle = nonproj.contains_cycle(self.heads)
-        if cycle != None:
+        if cycle is not None:
             raise Exception("Cycle found: %s" % cycle)
 
         if make_projective:
-            proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
+            proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
             self.heads = proj_heads
 
     def __len__(self):
@@ -497,20 +505,19 @@ cdef class GoldParse:
 
 
 def biluo_tags_from_offsets(doc, entities, missing='O'):
-    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    scheme (BILUO).
+    """Encode labelled spans into per-token tags, using the
+    Begin/In/Last/Unit/Out scheme (BILUO).
 
     doc (Doc): The document that the entity offsets refer to. The output tags
         will refer to the token boundaries within the document.
-    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
-        `end` should be character-offset integers denoting the slice into the
-        original string.
-
+    entities (iterable): A sequence of `(start, end, label)` triples. `start`
+        and `end` should be character-offset integers denoting the slice into
+        the original string.
     RETURNS (list): A list of unicode strings, describing the tags. Each tag
         string will be of the form either "", "O" or "{action}-{label}", where
         action is one of "B", "I", "L", "U". The string "-" is used where the
-        entity offsets don't align with the tokenization in the `Doc` object. The
-        training algorithm will view these as missing values. "O" denotes a
+        entity offsets don't align with the tokenization in the `Doc` object.
+        The training algorithm will view these as missing values. "O" denotes a
         non-entity token. "B" denotes the beginning of a multi-token entity,
         "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.
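
For reference, a minimal sketch of the BILUO encoding that
`biluo_tags_from_offsets` performs, assuming the spaCy 2.x API at the time of
this commit, where the helper is importable from `spacy.gold` (the sentence
and character offsets below are illustrative):

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank('en')  # tokenizer-only pipeline; no model needed here
    doc = nlp(u'I flew to San Francisco Bay')
    # One entity, given as a (start_char, end_char, label) triple over the
    # raw string: characters 10-27 cover "San Francisco Bay".
    entities = [(10, 27, 'LOC')]
    tags = biluo_tags_from_offsets(doc, entities)
    # The span aligns with three tokens, so the encoder emits Begin/In/Last:
    print(tags)  # ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']

Per the `GoldParse.__init__` docstring above, the `entities` argument accepts
either form: the BILUO tag strings produced here, or the raw
`(start_char, end_char, label)` tuples.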