From a6135336f5e2ec66fe95b2dc7a9a54cfb29167ac Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 27 Oct 2017 17:02:55 +0200
Subject: [PATCH] Tidy up gold

---
 spacy/gold.pyx | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 36 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 5729af667..921c837ba 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -54,7 +54,8 @@ def merge_sents(sents):
         m_deps[3].extend(head + i for head in heads)
         m_deps[4].extend(labels)
         m_deps[5].extend(ner)
-        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
+        m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
+                          for b in brackets)
         i += len(ids)
     return [(m_deps, m_brackets)]
 
@@ -80,6 +81,8 @@ def align(cand_words, gold_words):
 
 
 punct_re = re.compile(r'\W')
+
+
 def _min_edit_path(cand_words, gold_words):
     cdef:
         Pool mem
@@ -98,9 +101,9 @@ def _min_edit_path(cand_words, gold_words):
     mem = Pool()
     n_cand = len(cand_words)
     n_gold = len(gold_words)
-    # Levenshtein distance, except we need the history, and we may want different
-    # costs.
-    # Mark operations with a string, and score the history using _edit_cost.
+    # Levenshtein distance, except we need the history, and we may want
+    # different costs. Mark operations with a string, and score the history
+    # using _edit_cost.
     previous_row = []
     prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
     curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
@@ -144,9 +147,9 @@ def _min_edit_path(cand_words, gold_words):
 
 
 def minibatch(items, size=8):
-    '''Iterate over batches of items. `size` may be an iterator,
+    """Iterate over batches of items. `size` may be an iterator,
     so that batch-size can vary on each step.
-    '''
+    """
     if isinstance(size, int):
         size_ = itertools.repeat(8)
     else:
@@ -168,6 +171,7 @@ class GoldCorpus(object):
 
         train_path (unicode or Path): File or directory of training data.
        dev_path (unicode or Path): File or directory of development data.
+        RETURNS (GoldCorpus): The newly created object.
         """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
@@ -213,7 +217,7 @@ class GoldCorpus(object):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
-                    self.train_tuples, label_freq_cutoff=100)
+                self.train_tuples, label_freq_cutoff=100)
         random.shuffle(train_tuples)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                         max_length=max_length,
@@ -222,7 +226,6 @@ class GoldCorpus(object):
 
     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        #gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs
 
     @classmethod
@@ -233,7 +236,6 @@ class GoldCorpus(object):
                 raw_text = None
             else:
                 paragraph_tuples = merge_sents(paragraph_tuples)
-
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                   gold_preproc, noise_level=noise_level)
             golds = cls._make_golds(docs, paragraph_tuples)
@@ -248,17 +250,20 @@ class GoldCorpus(object):
             raw_text = add_noise(raw_text, noise_level)
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
-                    for (sent_tuples, brackets) in paragraph_tuples]
+            return [Doc(nlp.vocab,
+                        words=add_noise(sent_tuples[1], noise_level))
+                    for (sent_tuples, brackets) in paragraph_tuples]
 
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
         assert len(docs) == len(paragraph_tuples)
         if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
+            return [GoldParse.from_annot_tuples(docs[0],
+                                                paragraph_tuples[0][0])]
         else:
             return [GoldParse.from_annot_tuples(doc, sent_tuples)
-                    for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
+                    for doc, (sent_tuples, brackets)
+                    in zip(docs, paragraph_tuples)]
 
     @staticmethod
     def walk_corpus(path):
@@ -330,16 +335,16 @@ def read_json_file(loc, docs_filter=None, limit=None):
                 for i, token in enumerate(sent['tokens']):
                     words.append(token['orth'])
                     ids.append(i)
-                    tags.append(token.get('tag','-'))
-                    heads.append(token.get('head',0) + i)
-                    labels.append(token.get('dep',''))
+                    tags.append(token.get('tag', '-'))
+                    heads.append(token.get('head', 0) + i)
+                    labels.append(token.get('dep', ''))
                     # Ensure ROOT label is case-insensitive
                     if labels[-1].lower() == 'root':
                         labels[-1] = 'ROOT'
                     ner.append(token.get('ner', '-'))
                 sents.append([
                     [ids, words, tags, heads, labels, ner],
-                        sent.get('brackets', [])])
+                    sent.get('brackets', [])])
             if sents:
                 yield [paragraph.get('raw', None), sents]
 
@@ -382,19 +387,21 @@ cdef class GoldParse:
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples
-        return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
-                   make_projective=make_projective)
+        return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
+                   entities=entities, make_projective=make_projective)
 
-    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
-                 deps=None, entities=None, make_projective=False,
+    def __init__(self, doc, annot_tuples=None, words=None, tags=None,
+                 heads=None, deps=None, entities=None, make_projective=False,
                  cats=None):
         """Create a GoldParse.
 
         doc (Doc): The document the annotations refer to.
         words (iterable): A sequence of unicode word strings.
         tags (iterable): A sequence of strings, representing tag annotations.
-        heads (iterable): A sequence of integers, representing syntactic head offsets.
-        deps (iterable): A sequence of strings, representing the syntactic relation types.
+        heads (iterable): A sequence of integers, representing syntactic
+            head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic
+            relation types.
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
@@ -404,9 +411,10 @@ cdef class GoldParse:
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
-           true examples of a label to have the value 1.0, and negative examples
-           of a label to have the value 0.0. Labels not in the dictionary are
-           treated as missing -- the gradient for those labels will be zero.
+           true examples of a label to have the value 1.0, and negative
+           examples of a label to have the value 0.0. Labels not in the
+           dictionary are treated as missing - the gradient for those labels
+           will be zero.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:
@@ -470,11 +478,11 @@ cdef class GoldParse:
                     self.ner[i] = entities[gold_i]
 
         cycle = nonproj.contains_cycle(self.heads)
-        if cycle != None:
+        if cycle is not None:
             raise Exception("Cycle found: %s" % cycle)
 
         if make_projective:
-            proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
+            proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
             self.heads = proj_heads
 
     def __len__(self):
@@ -497,20 +505,19 @@ cdef class GoldParse:
 
 
 def biluo_tags_from_offsets(doc, entities, missing='O'):
-    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    scheme (BILUO).
+    """Encode labelled spans into per-token tags, using the
+    Begin/In/Last/Unit/Out scheme (BILUO).
 
     doc (Doc): The document that the entity offsets refer to. The output tags
         will refer to the token boundaries within the document.
-    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
-        `end` should be character-offset integers denoting the slice into the
-        original string.
-
+    entities (iterable): A sequence of `(start, end, label)` triples. `start`
+        and `end` should be character-offset integers denoting the slice into
+        the original string.
     RETURNS (list): A list of unicode strings, describing the tags. Each tag
         string will be of the form either "", "O" or "{action}-{label}", where
         action is one of "B", "I", "L", "U". The string "-" is used where the
-        entity offsets don't align with the tokenization in the `Doc` object. The
-        training algorithm will view these as missing values. "O" denotes a
+        entity offsets don't align with the tokenization in the `Doc` object.
+        The training algorithm will view these as missing values. "O" denotes a
         non-entity token. "B" denotes the beginning of a multi-token entity,
         "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.
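
For reference, a minimal sketch of the BILUO encoding that
`biluo_tags_from_offsets` performs, assuming the spaCy 2.x API at the time of
this commit, where the helper is importable from `spacy.gold` (the sentence
and character offsets below are illustrative):

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank('en')  # tokenizer-only pipeline; no model needed here
    doc = nlp(u'I flew to San Francisco Bay')
    # One entity, given as a (start_char, end_char, label) triple over the
    # raw string: characters 10-27 cover "San Francisco Bay".
    entities = [(10, 27, 'LOC')]
    tags = biluo_tags_from_offsets(doc, entities)
    # The span aligns with three tokens, so the encoder emits Begin/In/Last:
    print(tags)  # ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']

Per the `GoldParse.__init__` docstring above, the `entities` argument accepts
either form: the BILUO tag strings produced here, or the raw
`(start_char, end_char, label)` tuples.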