From 3448cb40a4daa5128a8590e06087d44dda25e046 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker <seeker@ims.uni-stuttgart.de>
Date: Tue, 1 Mar 2016 10:09:08 +0100
Subject: [PATCH] integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures
---
 setup.py                                 |   1 +
 spacy/gold.pyx                           |   2 +-
 spacy/syntax/nonproj.pxd                 |   0
 spacy/syntax/nonproj.pyx                 | 131 +++++++++++++----------
 spacy/syntax/parser.pxd                  |   1 +
 spacy/syntax/parser.pyx                  |   9 +-
 spacy/tests/{ => parser}/test_nonproj.py |  53 +++++----
 spacy/tokens/token.pyx                   |  24 +----
 8 files changed, 120 insertions(+), 101 deletions(-)
 create mode 100644 spacy/syntax/nonproj.pxd
 rename spacy/tests/{ => parser}/test_nonproj.py (60%)

diff --git a/setup.py b/setup.py
index 5c6cbbf01..d2a62dc90 100644
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@ MOD_NAMES = [
     'spacy.syntax._state',
     'spacy.tokenizer',
     'spacy.syntax.parser',
+    'spacy.syntax.nonproj',
     'spacy.syntax.transition_system',
     'spacy.syntax.arc_eager',
     'spacy.syntax._parse_features',
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 7ab034195..5c7326d12 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -14,7 +14,7 @@ try:
 except ImportError:
     import json
 
-import nonproj
+from .syntax import nonproj
 
 
 def tags_to_entities(tags):
diff --git a/spacy/syntax/nonproj.pxd b/spacy/syntax/nonproj.pxd
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index facf9f299..dbc5555c3 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -1,6 +1,10 @@
 from copy import copy
 from collections import Counter
 
+from ..tokens.doc cimport Doc
+from spacy.attrs import DEP, HEAD
+
+
 def ancestors(tokenid, heads):
     # returns all words going from the word up the path to the root
     # the path to root cannot be longer than the number of words in the sentence
@@ -55,69 +59,90 @@ def is_nonproj_tree(heads):
     return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
 
 
-class PseudoProjective:
+cdef class PseudoProjectivity:
     # implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
     # for doing pseudo-projective parsing
     # implementation uses the HEAD decoration scheme
 
-    def preprocess_training_data(self, labeled_trees, label_freq_cutoff=30):
-        # expects a sequence of pairs of head arrays and labels
+    delimiter = '||'
+
+    @classmethod
+    def decompose(cls, label):
+        return label.partition(cls.delimiter)[::2]
+
+    @classmethod
+    def is_decorated(cls, label):
+        return label.find(cls.delimiter) != -1
+
+    @classmethod
+    def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30):
         preprocessed = []
-        for heads,labels in labeled_trees:
-            proj_heads,deco_labels = self.projectivize(heads,labels)
-            # set the label to ROOT for each root dependent
-            deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
-            preprocessed.append((proj_heads,deco_labels))
+        freqs = Counter()
+        for raw_text, sents in gold_tuples:
+            prepro_sents = []
+            for (ids, words, tags, heads, labels, iob), ctnts in sents:
+                proj_heads,deco_labels = cls.projectivize(heads,labels)
+                # set the label to ROOT for each root dependent
+                deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
+                # count label frequencies
+                if label_freq_cutoff > 0:
+                    freqs.update( label for label in deco_labels if cls.is_decorated(label) )
+                prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
+            preprocessed.append((raw_text, prepro_sents))
 
         if label_freq_cutoff > 0:
-            return self._filter_labels(preprocessed,label_freq_cutoff)
+            return cls._filter_labels(preprocessed,label_freq_cutoff,freqs)
         return preprocessed
 
 
-    def projectivize(self, heads, labels):
+    @classmethod
+    def projectivize(cls, heads, labels):
         # use the algorithm by Nivre & Nilsson 2005
         # assumes heads to be a proper tree, i.e. connected and cycle-free
         # returns a new pair (heads,labels) which encode
         # a projective and decorated tree
         proj_heads = copy(heads)
-        smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads)
+        smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
         if smallest_np_arc == None: # this sentence is already projective
             return proj_heads, copy(labels)
         while smallest_np_arc != None:
-            self._lift(smallest_np_arc, proj_heads)
-            smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads)
-        deco_labels = self._decorate(heads, proj_heads, labels)
+            cls._lift(smallest_np_arc, proj_heads)
+            smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
+        deco_labels = cls._decorate(heads, proj_heads, labels)
         return proj_heads, deco_labels
 
 
-    def deprojectivize(self, heads, labels):
+    @classmethod
+    def deprojectivize(cls, Doc tokens):
         # reattach arcs with decorated labels (following HEAD scheme)
         # for each decorated arc X||Y, search top-down, left-to-right,
         # breadth-first until hitting a Y then make this the new head
-        newheads, newlabels = copy(heads), copy(labels)
-        spans = None
-        for tokenid, head in enumerate(heads):
-            if labels[tokenid].find('||') != -1:
-                newlabel,_,headlabel = labels[tokenid].partition('||')
-                newhead = self._find_new_head(head,tokenid,headlabel,heads,labels,spans=spans)
-                newheads[tokenid] = newhead
-                newlabels[tokenid] = newlabel
-        return newheads, newlabels
+        parse = tokens.to_array([HEAD, DEP])
+        labels = [ tokens.vocab.strings[int(p[1])] for p in parse ]
+        for token in tokens:
+            if cls.is_decorated(token.dep_):
+                newlabel,headlabel = cls.decompose(token.dep_)
+                newhead = cls._find_new_head(token,headlabel)
+                parse[token.i,1] = tokens.vocab.strings[newlabel]
+                parse[token.i,0] = newhead.i - token.i
+        tokens.from_array([HEAD, DEP],parse)
 
 
-    def _decorate(self, heads, proj_heads, labels):
+    @classmethod
+    def _decorate(cls, heads, proj_heads, labels):
         # uses decoration scheme HEAD from Nivre & Nilsson 2005
         assert(len(heads) == len(proj_heads) == len(labels))
         deco_labels = []
         for tokenid,head in enumerate(heads):
             if head != proj_heads[tokenid]:
-                deco_labels.append('%s||%s' % (labels[tokenid],labels[head]))
+                deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head]))
             else:
                 deco_labels.append(labels[tokenid])
         return deco_labels
 
 
-    def _get_smallest_nonproj_arc(self, heads):
+    @classmethod
+    def _get_smallest_nonproj_arc(cls, heads):
         # return the smallest non-proj arc or None
         # where size is defined as the distance between dep and head
         # and ties are broken left to right
@@ -131,7 +156,8 @@ class PseudoProjective:
         return smallest_np_arc
 
 
-    def _lift(self, tokenid, heads):
+    @classmethod
+    def _lift(cls, tokenid, heads):
         # reattaches a word to it's grandfather
         head = heads[tokenid]
         ghead = heads[head]
@@ -139,43 +165,36 @@ class PseudoProjective:
         heads[tokenid] = ghead if head != ghead else tokenid
 
 
-    def _find_new_head(self, rootid, tokenid, headlabel, heads, labels, spans=None):
+    @classmethod
+    def _find_new_head(cls, token, headlabel):
         # search through the tree starting from root
         # returns the id of the first descendant with the given label
         # if there is none, return the current head (no change)
-        if not spans:
-            spans = self._make_span_index(heads)
-        queue = spans.get(rootid,[])
-        queue.remove(tokenid) # don't search in the subtree of the nonproj arc
+        queue = [token.head]
         while queue:
             next_queue = []
-            for idx in queue:
-                if labels[idx] == headlabel:
-                    return idx
-                next_queue.extend(spans.get(idx,[]))
+            for qtoken in queue:
+                for child in qtoken.children:
+                    if child == token:
+                        continue
+                    if child.dep_ == headlabel:
+                        return child
+                    next_queue.append(child)
             queue = next_queue
-        return heads[tokenid]
+        return token.head
 
 
-    def _make_span_index(self, heads):
-        # stores the direct dependents for each token
-        # for searching top-down through a tree
-        spans = {}
-        for tokenid, head in enumerate(heads):
-            if tokenid == head: # root
-                continue
-            if head not in spans:
-                spans[head] = []
-            spans[head].append(tokenid)
-        return spans
-
-
-    def _filter_labels(self, labeled_trees, cutoff):
+    @classmethod
+    def _filter_labels(cls, gold_tuples, cutoff, freqs):
         # throw away infrequent decorated labels
         # can't learn them reliably anyway and keeps label set smaller
-        freqs = Counter([ label for _,labels in labeled_trees for label in labels if label.find('||') != -1 ])
         filtered = []
-        for proj_heads,deco_labels in labeled_trees:
-            filtered_labels = [ label.partition('||')[0] if freqs.get(label,cutoff) < cutoff else label for label in deco_labels ]
-            filtered.append((proj_heads,filtered_labels))
+        for raw_text, sents in gold_tuples:
+            filtered_sents = []
+            for (ids, words, tags, heads, labels, iob), ctnts in sents:
+                filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
+                filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
+            filtered.append((raw_text, filtered_sents))
         return filtered
+
+
diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd
index 77ea376a1..e10049fb6 100644
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -15,5 +15,6 @@ cdef class ParserModel(AveragedPerceptron):
 cdef class Parser:
     cdef readonly ParserModel model
     cdef readonly TransitionSystem moves
+    cdef int _projectivize
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 7a352c0d9..2e2f009fd 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -17,6 +17,7 @@ from os import path
 import shutil
 import json
 import sys
+from .nonproj import PseudoProjectivity
 
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
@@ -78,9 +79,10 @@ cdef class ParserModel(AveragedPerceptron):
 
 
 cdef class Parser:
-    def __init__(self, StringStore strings, transition_system, ParserModel model):
+    def __init__(self, StringStore strings, transition_system, ParserModel model, int projectivize = 0):
         self.moves = transition_system
         self.model = model
+        self._projectivize = projectivize
 
     @classmethod
     def from_dir(cls, model_dir, strings, transition_system):
@@ -94,7 +96,7 @@ cdef class Parser:
         model = ParserModel(templates)
         if path.exists(path.join(model_dir, 'model')):
             model.load(path.join(model_dir, 'model'))
-        return cls(strings, moves, model)
+        return cls(strings, moves, model, cfg.projectivize)
 
     @classmethod
     def load(cls, pkg_or_str_or_file, vocab):
@@ -113,6 +115,9 @@ cdef class Parser:
             tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
         PyErr_CheckSignals()
+        # projectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(tokens)
 
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
diff --git a/spacy/tests/test_nonproj.py b/spacy/tests/parser/test_nonproj.py
similarity index 60%
rename from spacy/tests/test_nonproj.py
rename to spacy/tests/parser/test_nonproj.py
index d5290e342..443db18ae 100644
--- a/spacy/tests/test_nonproj.py
+++ b/spacy/tests/parser/test_nonproj.py
@@ -1,7 +1,13 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjective
+from spacy.tokens.doc import Doc
+from spacy.vocab import Vocab
+from spacy.tokenizer import Tokenizer
+from spacy.attrs import DEP, HEAD
+import numpy
+
+from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjectivity
 
 def test_ancestors():
 	tree = [1,2,2,4,5,2,2]
@@ -50,52 +56,53 @@ def test_is_nonproj_tree():
 	assert(is_nonproj_tree(partial_tree) == False)
 	assert(is_nonproj_tree(multirooted_tree) == True)
 
-def test_pseudoprojective():
+def test_pseudoprojectivity():
 	tree = [1,2,2]
 	nonproj_tree = [1,2,2,4,5,2,7,4,2]
 	labels = ['NK','SB','ROOT','NK','OA','OC','SB','RC','--']
 	nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1]
 	labels2 = ['MO','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--']
 
-	pp = PseudoProjective()
+	assert(PseudoProjectivity.decompose('X||Y') == ('X','Y'))
+	assert(PseudoProjectivity.decompose('X') == ('X',''))
 
-	assert(pp._make_span_index(tree) == { 1:[0], 2:[1] })
-	assert(pp._make_span_index(nonproj_tree) == { 1:[0], 2:[1,5,8], 4:[3,7], 5:[4], 7:[6] })
+	assert(PseudoProjectivity.is_decorated('X||Y') == True)
+	assert(PseudoProjectivity.is_decorated('X') == False)
 
-	pp._lift(0,tree)
+	PseudoProjectivity._lift(0,tree)
 	assert(tree == [2,2,2])
 
-	np_arc = pp._get_smallest_nonproj_arc(nonproj_tree)
+	np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree)
 	assert(np_arc == 7)
 
-	np_arc = pp._get_smallest_nonproj_arc(nonproj_tree2)
+	np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree2)
 	assert(np_arc == 10)
 
-	proj_heads, deco_labels = pp.projectivize(nonproj_tree,labels)
+	proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree,labels)
 	assert(proj_heads == [1,2,2,4,5,2,7,5,2])
 	assert(deco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC||OA','--'])
-	deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels)
-	assert(deproj_heads == nonproj_tree)
-	assert(undeco_labels == labels)
+	# deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize(proj_heads,deco_labels)
+	# assert(deproj_heads == nonproj_tree)
+	# assert(undeco_labels == labels)
 
-	proj_heads, deco_labels = pp.projectivize(nonproj_tree2,labels2)
+	proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree2,labels2)
 	assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1])
 	assert(deco_labels == ['MO||OC','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--'])
-	deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels)
-	assert(deproj_heads == nonproj_tree2)
-	assert(undeco_labels == labels2)
+	# deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize(proj_heads,deco_labels)
+	# assert(deproj_heads == nonproj_tree2)
+	# assert(undeco_labels == labels2)
 
 	# if decoration is wrong such that there is no head with the desired label
 	# the structure is kept and the label is undecorated
-	deproj_heads, undeco_labels = pp.deprojectivize([1,2,2,4,5,2,7,5,2],['NK','SB','ROOT','NK','OA','OC','SB','RC||DA','--'])
-	assert(deproj_heads == [1,2,2,4,5,2,7,5,2])
-	assert(undeco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC','--'])
+	# deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize([1,2,2,4,5,2,7,5,2],['NK','SB','ROOT','NK','OA','OC','SB','RC||DA','--'])
+	# assert(deproj_heads == [1,2,2,4,5,2,7,5,2])
+	# assert(undeco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC','--'])
 
 	# if there are two potential new heads, the first one is chosen even if it's wrong
-	deproj_heads, undeco_labels = pp.deprojectivize([1,1,3,1,5,6,9,8,6,1,9,12,13,10,1], \
-		                                            ['MO||OC','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--'])
-	assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1])
-	assert(undeco_labels == ['MO','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--'])
+	# deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize([1,1,3,1,5,6,9,8,6,1,9,12,13,10,1], \
+	# 	                                            ['MO||OC','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--'])
+	# assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1])
+	# assert(undeco_labels == ['MO','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--'])
 
 
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 342bcf409..0ff574f1b 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -201,17 +201,9 @@ cdef class Token:
             cdef int nr_iter = 0
             cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
             while ptr < self.c:
-                # If this head is still to the right of us, we can skip to it
-                # No token that's between this token and this head could be our
-                # child.
-                if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
-                    ptr += ptr.head
-
-                elif ptr + ptr.head == self.c:
+                if ptr + ptr.head == self.c:
                     yield self.doc[ptr - (self.c - self.i)]
-                    ptr += 1
-                else:
-                    ptr += 1
+                ptr += 1
                 nr_iter += 1
                 # This is ugly, but it's a way to guard out infinite loops
                 if nr_iter >= 10000000:
@@ -226,16 +218,10 @@ cdef class Token:
             tokens = []
             cdef int nr_iter = 0
             while ptr > self.c:
-                # If this head is still to the right of us, we can skip to it
-                # No token that's between this token and this head could be our
-                # child.
-                if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
-                    ptr += ptr.head
-                elif ptr + ptr.head == self.c:
+                if ptr + ptr.head == self.c:
                     tokens.append(self.doc[ptr - (self.c - self.i)])
-                    ptr -= 1
-                else:
-                    ptr -= 1
+                ptr -= 1
+                nr_iter += 1
                 if nr_iter >= 10000000:
                     raise RuntimeError(
                         "Possibly infinite loop encountered while looking for token.rights")