diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 7d8e44f79..45b95b379 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -173,12 +173,11 @@ class GoldCorpus(object): if shuffle: random.shuffle(self.train_locs) if projectivize: - train_tuples = nonproj.PseudoProjectivity.preprocess_training_data( + train_tuples = nonproj.preprocess_training_data( self.train_tuples) - gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc) if shuffle: - gold_docs = util.itershuffle(gold_docs, bufsize=shuffle*1000) - gold_docs = nlp.preprocess_gold(gold_docs) + random.shuffle(train_tuples) + gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc) yield from gold_docs def dev_docs(self, nlp): @@ -236,7 +235,7 @@ class GoldCorpus(object): return locs -def read_json_file(loc, docs_filter=None, limit=1000): +def read_json_file(loc, docs_filter=None, limit=None): loc = ensure_path(loc) if loc.is_dir(): for filename in loc.iterdir(): @@ -390,7 +389,7 @@ cdef class GoldParse: raise Exception("Cycle found: %s" % cycle) if make_projective: - proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels) + proj_heads,_ = nonproj.projectivize(self.heads, self.labels) self.heads = proj_heads def __len__(self): diff --git a/spacy/language.py b/spacy/language.py index 0f38252f7..475797ee2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -13,7 +13,7 @@ from .vocab import Vocab from .tagger import Tagger from .lemmatizer import Lemmatizer from .syntax.parser import get_templates -from .syntax.nonproj import PseudoProjectivity +from .syntax.import nonproj from .pipeline import NeuralDependencyParser, EntityRecognizer from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer from .pipeline import NeuralLabeller @@ -97,7 +97,7 @@ class BaseDefaults(object): 'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], 'dependencies': lambda nlp, **cfg: [ NeuralDependencyParser(nlp.vocab, **cfg), - PseudoProjectivity.deprojectivize], + nonproj.deprojectivize], 'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], } diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 6cd2fea95..81e44e84b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -47,7 +47,7 @@ from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context from .stateclass cimport StateClass from ._state cimport StateC -from .nonproj import PseudoProjectivity +from . import nonproj from .transition_system import OracleError from .transition_system cimport TransitionSystem, Transition from ..structs cimport TokenC @@ -435,7 +435,7 @@ cdef class Parser: def begin_training(self, gold_tuples, **cfg): if 'model' in cfg: self.model = cfg['model'] - gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) + gold_tuples = nonproj.preprocess_training_data(gold_tuples) actions = self.moves.get_actions(gold_parses=gold_tuples) for action, labels in actions.items(): for label in labels: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index b966a826e..880235440 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,10 +1,17 @@ # coding: utf-8 +""" +Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 +for doing pseudo-projective parsing implementation uses the HEAD decoration +scheme. +""" from __future__ import unicode_literals from copy import copy from ..tokens.doc cimport Doc from ..attrs import DEP, HEAD +DELIMITER = '||' + def ancestors(tokenid, heads): # returns all words going from the word up the path to the root @@ -60,139 +67,126 @@ def is_nonproj_tree(heads): return any( is_nonproj_arc(word,heads) for word in range(len(heads)) ) -class PseudoProjectivity: - # implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 - # for doing pseudo-projective parsing - # implementation uses the HEAD decoration scheme - - delimiter = '||' - - @classmethod - def decompose(cls, label): - return label.partition(cls.delimiter)[::2] - - @classmethod - def is_decorated(cls, label): - return label.find(cls.delimiter) != -1 - - @classmethod - def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30): - preprocessed = [] - freqs = {} - for raw_text, sents in gold_tuples: - prepro_sents = [] - for (ids, words, tags, heads, labels, iob), ctnts in sents: - proj_heads,deco_labels = cls.projectivize(heads,labels) - # set the label to ROOT for each root dependent - deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] - # count label frequencies - if label_freq_cutoff > 0: - for label in deco_labels: - if cls.is_decorated(label): - freqs[label] = freqs.get(label,0) + 1 - prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts)) - preprocessed.append((raw_text, prepro_sents)) - - if label_freq_cutoff > 0: - return cls._filter_labels(preprocessed,label_freq_cutoff,freqs) - return preprocessed +def decompose(label): + return label.partition(DELIMITER)[::2] - @classmethod - def projectivize(cls, heads, labels): - # use the algorithm by Nivre & Nilsson 2005 - # assumes heads to be a proper tree, i.e. connected and cycle-free - # returns a new pair (heads,labels) which encode - # a projective and decorated tree - proj_heads = copy(heads) - smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads) - if smallest_np_arc == None: # this sentence is already projective - return proj_heads, copy(labels) - while smallest_np_arc != None: - cls._lift(smallest_np_arc, proj_heads) - smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads) - deco_labels = cls._decorate(heads, proj_heads, labels) - return proj_heads, deco_labels +def is_decorated(label): + return label.find(DELIMITER) != -1 - @classmethod - def deprojectivize(cls, tokens): - # reattach arcs with decorated labels (following HEAD scheme) - # for each decorated arc X||Y, search top-down, left-to-right, - # breadth-first until hitting a Y then make this the new head - for token in tokens: - if cls.is_decorated(token.dep_): - newlabel,headlabel = cls.decompose(token.dep_) - newhead = cls._find_new_head(token,headlabel) - token.head = newhead - token.dep_ = newlabel - return tokens +def preprocess_training_data(gold_tuples, label_freq_cutoff=30): + preprocessed = [] + freqs = {} + for raw_text, sents in gold_tuples: + prepro_sents = [] + for (ids, words, tags, heads, labels, iob), ctnts in sents: + proj_heads,deco_labels = projectivize(heads,labels) + # set the label to ROOT for each root dependent + deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] + # count label frequencies + if label_freq_cutoff > 0: + for label in deco_labels: + if is_decorated(label): + freqs[label] = freqs.get(label,0) + 1 + prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts)) + preprocessed.append((raw_text, prepro_sents)) - @classmethod - def _decorate(cls, heads, proj_heads, labels): - # uses decoration scheme HEAD from Nivre & Nilsson 2005 - assert(len(heads) == len(proj_heads) == len(labels)) - deco_labels = [] - for tokenid,head in enumerate(heads): - if head != proj_heads[tokenid]: - deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head])) - else: - deco_labels.append(labels[tokenid]) - return deco_labels + if label_freq_cutoff > 0: + return _filter_labels(preprocessed,label_freq_cutoff,freqs) + return preprocessed - @classmethod - def _get_smallest_nonproj_arc(cls, heads): - # return the smallest non-proj arc or None - # where size is defined as the distance between dep and head - # and ties are broken left to right - smallest_size = float('inf') - smallest_np_arc = None - for tokenid,head in enumerate(heads): - size = abs(tokenid-head) - if size < smallest_size and is_nonproj_arc(tokenid,heads): - smallest_size = size - smallest_np_arc = tokenid - return smallest_np_arc +@classmethod +def projectivize(heads, labels): + # use the algorithm by Nivre & Nilsson 2005 + # assumes heads to be a proper tree, i.e. connected and cycle-free + # returns a new pair (heads,labels) which encode + # a projective and decorated tree + proj_heads = copy(heads) + smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) + if smallest_np_arc == None: # this sentence is already projective + return proj_heads, copy(labels) + while smallest_np_arc != None: + _lift(smallest_np_arc, proj_heads) + smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) + deco_labels = _decorate(heads, proj_heads, labels) + return proj_heads, deco_labels - @classmethod - def _lift(cls, tokenid, heads): - # reattaches a word to it's grandfather - head = heads[tokenid] - ghead = heads[head] - # attach to ghead if head isn't attached to root else attach to root - heads[tokenid] = ghead if head != ghead else tokenid +@classmethod +def deprojectivize(tokens): + # reattach arcs with decorated labels (following HEAD scheme) + # for each decorated arc X||Y, search top-down, left-to-right, + # breadth-first until hitting a Y then make this the new head + for token in tokens: + if is_decorated(token.dep_): + newlabel,headlabel = decompose(token.dep_) + newhead = _find_new_head(token,headlabel) + token.head = newhead + token.dep_ = newlabel + return tokens + +def _decorate(heads, proj_heads, labels): + # uses decoration scheme HEAD from Nivre & Nilsson 2005 + assert(len(heads) == len(proj_heads) == len(labels)) + deco_labels = [] + for tokenid,head in enumerate(heads): + if head != proj_heads[tokenid]: + deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) + else: + deco_labels.append(labels[tokenid]) + return deco_labels - @classmethod - def _find_new_head(cls, token, headlabel): - # search through the tree starting from the head of the given token - # returns the id of the first descendant with the given label - # if there is none, return the current head (no change) - queue = [token.head] - while queue: - next_queue = [] - for qtoken in queue: - for child in qtoken.children: - if child.is_space: continue - if child == token: continue - if child.dep_ == headlabel: - return child - next_queue.append(child) - queue = next_queue - return token.head +def _get_smallest_nonproj_arc(heads): + # return the smallest non-proj arc or None + # where size is defined as the distance between dep and head + # and ties are broken left to right + smallest_size = float('inf') + smallest_np_arc = None + for tokenid,head in enumerate(heads): + size = abs(tokenid-head) + if size < smallest_size and is_nonproj_arc(tokenid,heads): + smallest_size = size + smallest_np_arc = tokenid + return smallest_np_arc - @classmethod - def _filter_labels(cls, gold_tuples, cutoff, freqs): - # throw away infrequent decorated labels - # can't learn them reliably anyway and keeps label set smaller - filtered = [] - for raw_text, sents in gold_tuples: - filtered_sents = [] - for (ids, words, tags, heads, labels, iob), ctnts in sents: - filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] - filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) - filtered.append((raw_text, filtered_sents)) - return filtered +def _lift(tokenid, heads): + # reattaches a word to it's grandfather + head = heads[tokenid] + ghead = heads[head] + # attach to ghead if head isn't attached to root else attach to root + heads[tokenid] = ghead if head != ghead else tokenid + + +def _find_new_head(token, headlabel): + # search through the tree starting from the head of the given token + # returns the id of the first descendant with the given label + # if there is none, return the current head (no change) + queue = [token.head] + while queue: + next_queue = [] + for qtoken in queue: + for child in qtoken.children: + if child.is_space: continue + if child == token: continue + if child.dep_ == headlabel: + return child + next_queue.append(child) + queue = next_queue + return token.head + + +def _filter_labels(gold_tuples, cutoff, freqs): + # throw away infrequent decorated labels + # can't learn them reliably anyway and keeps label set smaller + filtered = [] + for raw_text, sents in gold_tuples: + filtered_sents = [] + for (ids, words, tags, heads, labels, iob), ctnts in sents: + filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] + filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) + filtered.append((raw_text, filtered_sents)) + return filtered