From a2d6b195dbb990767f93488bfae54ee98e891f50 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 Jan 2015 03:09:45 +1100 Subject: [PATCH 01/17] * Add messy Break transitions, carefully following the scheme of Dd Zhang et al (2013) --- spacy/syntax/arc_eager.pyx | 101 +++++++++++++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index f9ae320e5..55c20eb33 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -8,7 +8,9 @@ from ._state cimport head_in_stack, children_in_stack from ..structs cimport TokenC + DEF NON_MONOTONIC = True +DEF USE_BREAK = True cdef enum: @@ -16,6 +18,8 @@ cdef enum: REDUCE LEFT RIGHT + BREAK_SHIFT + BREAK_RIGHT N_MOVES @@ -41,6 +45,43 @@ cdef inline bint _can_reduce(const State* s) nogil: return s.stack_len >= 2 and has_head(get_s0(s)) +cdef inline bint _can_break_shift(const State* s) nogil: + cdef int i + if not USE_BREAK: + return False + elif not _can_shift(s): + return False + else: + # P. 757 + # In UPP, if Shift(F) or RightArc(F) fail to result in a single parsing + # tree, they cannot be performed as well. + seen_headless = False + for i in range(s.stack_len): + if s.sent[s.stack[i]].head == 0: + return False + return True + + +cdef inline bint _can_break_right(const State* s) nogil: + cdef int i + if not USE_BREAK: + return False + elif not _can_right(s): + return False + else: + # P. 757 + # In UPP, if Shift(F) or RightArc(F) fail to result in a single parsing + # tree, they cannot be performed as well. + seen_headless = False + for i in range(s.stack_len): + if s.sent[s.stack[i]].head == 0: + if seen_headless: + return False + else: + seen_headless = True + return True + + cdef int _shift_cost(const State* s, const int* gold) except -1: assert not at_eol(s) cost = 0 @@ -85,6 +126,28 @@ cdef int _reduce_cost(const State* s, const int* gold) except -1: return cost +cdef int _break_shift_cost(const State* s, const int* gold) except -1: + cdef int cost = _shift_cost(s, gold) + # When we break, we Reduce all of the words on the stack. So, the Break + # cost is the sum of the Reduce costs + for i in range(s.stack_len): + cost += children_in_buffer(s, s.stack[i], gold) + if NON_MONOTONIC: + cost += head_in_buffer(s, s.stack[i], gold) + return cost + + +cdef int _break_right_cost(const State* s, const int* gold) except -1: + cdef int cost = _right_cost(s, gold) + # When we break, we Reduce all of the words on the stack. 
So, the Break + # cost is the sum of the Reduce costs + for i in range(s.stack_len): + cost += children_in_buffer(s, s.stack[i], gold) + if NON_MONOTONIC: + cost += head_in_buffer(s, s.stack[i], gold) + return cost + + cdef class TransitionSystem: def __init__(self, list left_labels, list right_labels): self.mem = Pool() @@ -94,7 +157,7 @@ cdef class TransitionSystem: right_labels.pop(right_labels.index('ROOT')) if 'ROOT' in left_labels: left_labels.pop(left_labels.index('ROOT')) - self.n_moves = 2 + len(left_labels) + len(right_labels) + self.n_moves = 3 + len(left_labels) + len(right_labels) + len(right_labels) moves = self.mem.alloc(self.n_moves, sizeof(Transition)) cdef int i = 0 moves[i].move = SHIFT @@ -121,6 +184,17 @@ cdef class TransitionSystem: moves[i].label = label_id moves[i].clas = i i += 1 + moves[i].move = BREAK_SHIFT + moves[i].label = 0 + moves[i].clas = i + i += 1 + for label_str in right_labels: + label_str = unicode(label_str) + label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) + moves[i].move = BREAK_RIGHT + moves[i].label = label_id + moves[i].clas = i + i += 1 self._moves = moves cdef int transition(self, State *s, const Transition* t) except -1: @@ -138,6 +212,23 @@ cdef class TransitionSystem: elif t.move == REDUCE: add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep) pop_stack(s) + elif t.move == BREAK_RIGHT: + add_dep(s, s.stack[0], s.i, t.label) + push_stack(s) + while s.stack_len != 0: + if not has_head(get_s0(s)): + get_s0(s).dep = 0 + s.stack -= 1 + s.stack_len -= 1 + if not at_eol(s): + push_stack(s) + elif t.move == BREAK_SHIFT: + push_stack(s) + get_s0(s).dep = 0 + s.stack -= s.stack_len + s.stack_len = 0 + if not at_eol(s): + push_stack(s) else: raise Exception(t.move) @@ -147,6 +238,8 @@ cdef class TransitionSystem: valid[LEFT] = _can_left(s) valid[RIGHT] = _can_right(s) valid[REDUCE] = _can_reduce(s) + valid[BREAK_SHIFT] = _can_break_shift(s) + valid[BREAK_RIGHT] = _can_break_right(s) cdef int best = -1 cdef weight_t score = 0 @@ -175,6 +268,8 @@ cdef class TransitionSystem: unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1 unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1 unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1 + unl_costs[BREAK_SHIFT] = _break_shift_cost(s, gold_heads) if _can_break_shift(s) else -1 + unl_costs[BREAK_RIGHT] = _break_right_cost(s, gold_heads) if _can_break_right(s) else -1 guess.cost = unl_costs[guess.move] cdef Transition t @@ -190,11 +285,11 @@ cdef class TransitionSystem: return t elif gold_heads[s.i] == s.stack[0]: target_label = gold_labels[s.i] - if guess.move == RIGHT: + if guess.move == RIGHT or guess.move == BREAK_RIGHT: guess.cost += guess.label != target_label for i in range(self.n_moves): t = self._moves[i] - if t.move == RIGHT and t.label == target_label: + if (t.move == RIGHT or t.move == BREAK_RIGHT) and t.label == target_label: return t cdef int best = -1 From 1884a7a0bed80cbc210c0e2d78e5d5eb67b60bb1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 Jan 2015 03:18:43 +1100 Subject: [PATCH 02/17] * Attach comment with paper --- spacy/syntax/arc_eager.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 55c20eb33..78c26d22a 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -22,6 +22,9 @@ cdef enum: BREAK_RIGHT N_MOVES +# Break transition from here +# http://www.aclweb.org/anthology/P13-1074 + cdef inline bint _can_shift(const 
State* s) nogil: return not at_eol(s) From 9171284d62bd0a47cf6cdc6e7ca7eb58484a2315 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 Jan 2015 12:27:44 +1100 Subject: [PATCH 03/17] * Fix compile-from-source instructions --- docs/source/quickstart.rst | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index c583822d0..1f0f034f4 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -12,17 +12,6 @@ Install $ pip install spacy $ python -m spacy.en.download -To compile from source: - -.. code:: bash - - $ git clone https://github.com/honnibal/spaCy.git - $ virtualenv .env && source .env/bin/activate - $ pip install -r requirements.txt - $ python -m spacy.en.download - $ fab make test - - The download command fetches and installs about 300mb of data, for the `parser model`_ and `word vectors`_, which it installs within the spacy.en package directory. @@ -30,6 +19,27 @@ and `word vectors`_, which it installs within the spacy.en package directory. .. _parser model: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_deps-0.30.tgz +Compilation from source is currently complicated, because there's binary data +that's provided in the PyPi package, but is not in the repository. As +a temporary workaround, you can download the PyPi package and extract it from +there. I'll have a better solution shortly, probably using Github Releases. + +.. code:: bash + + $ git clone https://github.com/honnibal/spaCy.git + $ cd spaCy + $ virtualenv .env && source .env/bin/activate + $ export PYTHONPATH=`pwd` + $ pip install -r requirements.txt + $ wget https://devpi.net/root/pypi/+f/4e8/d81919a7876fe/spacy-0.33.tar.gz + $ tar -xzf spacy-0.33.tar.gz + $ cp -r spacy-0.33/spacy/en/data spacy/en/data + $ python -m spacy.en.download + $ fab make test + +Python packaging is awkward at the best of times, and it's particularly tricky +with C extensions, built via Cython, requiring large data files. 
So, please +bear with me :) Usage ----- From b08c0ce54e2a33e03c47dae6e00c52a60f59369c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 Jan 2015 13:58:33 +1100 Subject: [PATCH 04/17] * Fix numpy install problem --- setup.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 1886f70df..d61ff54ab 100644 --- a/setup.py +++ b/setup.py @@ -68,15 +68,18 @@ def c_ext(mod_name, language, includes, compile_args): extra_compile_args=compile_args, extra_link_args=compile_args) -def cython_ext(mod_name, language, includes, compile_args): +def cython_exts(mod_names, language, includes, compile_args): import Cython.Distutils import Cython.Build - mod_path = mod_name.replace('.', '/') + '.pyx' if language == 'cpp': language = 'c++' - ext = Extension(mod_name, [mod_path], language=language, include_dirs=includes, - extra_compile_args=compile_args) - return Cython.Build.cythonize([ext])[0] + exts = [] + for mod_name in mod_names: + mod_path = mod_name.replace('.', '/') + '.pyx' + e = Extension(mod_name, [mod_path], language=language, include_dirs=includes, + extra_compile_args=compile_args) + exts.append(e) + return Cython.Build.cythonize(exts) def run_setup(exts): @@ -110,10 +113,12 @@ def run_setup(exts): def main(modules, is_pypy): language = "cpp" - ext_func = cython_ext if use_cython else c_ext includes = ['.', path.join(sys.prefix, 'include')] compile_args = ['-O3'] - exts = [ext_func(mn, language, includes, compile_args) for mn in modules] + if use_cython: + exts = cython_exts(modules, language, includes, compile_args) + else: + exts = [c_ext(mn, language, includes, compile_args) for mn in modules] run_setup(exts) From f590382134ffdfa5a44d635e1c77444d9481f0e2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 29 Jan 2015 03:18:29 +1100 Subject: [PATCH 05/17] * Work on sbd --- spacy/syntax/arc_eager.pyx | 75 +++++++++++++++++++++++++++----------- spacy/syntax/parser.pyx | 27 +++++++++++--- 2 files changed, 75 insertions(+), 27 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 78c26d22a..5c5ab5356 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -52,13 +52,12 @@ cdef inline bint _can_break_shift(const State* s) nogil: cdef int i if not USE_BREAK: return False - elif not _can_shift(s): + elif at_eol(s): return False else: # P. 757 # In UPP, if Shift(F) or RightArc(F) fail to result in a single parsing # tree, they cannot be performed as well. - seen_headless = False for i in range(s.stack_len): if s.sent[s.stack[i]].head == 0: return False @@ -92,12 +91,18 @@ cdef int _shift_cost(const State* s, const int* gold) except -1: cost += children_in_stack(s, s.i, gold) if NON_MONOTONIC: cost += gold[s.stack[0]] == s.i + # If we can break, and there's no cost to doing so, we should + if _can_break_shift(s) and _break_shift_cost(s, gold) == 0: + cost += 1 return cost cdef int _right_cost(const State* s, const int* gold) except -1: assert s.stack_len >= 1 cost = 0 + # If we can break, and there's no cost to doing so, we should + if _can_break_right(s) and _break_right_cost(s, gold) == 0: + cost += 1 if gold[s.i] == s.stack[0]: return cost cost += head_in_buffer(s, s.i, gold) @@ -130,24 +135,48 @@ cdef int _reduce_cost(const State* s, const int* gold) except -1: cdef int _break_shift_cost(const State* s, const int* gold) except -1: - cdef int cost = _shift_cost(s, gold) - # When we break, we Reduce all of the words on the stack. 
So, the Break - # cost is the sum of the Reduce costs - for i in range(s.stack_len): - cost += children_in_buffer(s, s.stack[i], gold) - if NON_MONOTONIC: - cost += head_in_buffer(s, s.stack[i], gold) + # When we break, we Reduce all of the words on the stack. We also remove + # the first word from the buffer. + # + # n0_cost: + cdef int cost = 0 + # number of head/child deps between n0 and N1...Nn + cost += children_in_buffer(s, s.i, gold) + cost += head_in_buffer(s, s.i, gold) + # Don't count self-deps + if gold[s.i] == s.i: + cost -= 2 + # number of child deps from N0 into stack + cost += children_in_stack(s, s.i, gold) + # number of head deps to N0 from stack + cost += head_in_stack(s, s.i, gold) + # Number of deps between S0...Sn and N1...Nn + for i in range(s.i+1, s.sent_len): + cost += children_in_stack(s, i, gold) + cost += head_in_stack(s, i, gold) return cost cdef int _break_right_cost(const State* s, const int* gold) except -1: - cdef int cost = _right_cost(s, gold) - # When we break, we Reduce all of the words on the stack. So, the Break - # cost is the sum of the Reduce costs - for i in range(s.stack_len): - cost += children_in_buffer(s, s.stack[i], gold) - if NON_MONOTONIC: - cost += head_in_buffer(s, s.stack[i], gold) + cdef int cost = 0 + assert s.stack_len >= 1 + cdef int i + # When we break, we Reduce all of the words on the stack. We also remove + # the first word from the buffer. + # + # n0_cost: + # number of head/child deps between n0 and N0...Nn + cost += children_in_buffer(s, s.i, gold) + cost += head_in_buffer(s, s.i, gold) + # number of child deps from N0 into stack + cost += children_in_stack(s, s.i, gold) + # number of head deps to N0 from S1..Sn + for i in range(1, s.stack_len): + cost += s.stack[-i] == gold[s.i] + # Number of deps between S0...Sn and N1...Nn + for i in range(s.i+1, s.sent_len): + cost += children_in_stack(s, i, gold) + cost += head_in_stack(s, i, gold) return cost @@ -213,14 +242,14 @@ cdef class TransitionSystem: add_dep(s, s.stack[0], s.i, t.label) push_stack(s) elif t.move == REDUCE: + # TODO: Huh? Is this some weirdness from the non-monotonic? 
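# Editorial note (not in the original patch): as written, Reduce attaches S0
# to the second stack item via add_dep(S1, S0, S0.dep) and then pops it.
# Since _can_reduce(), defined earlier in this file, still requires
# has_head(get_s0(s)), that extra attachment looks redundant, which seems to
# be what the TODO above is asking about.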
add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep) pop_stack(s) elif t.move == BREAK_RIGHT: add_dep(s, s.stack[0], s.i, t.label) push_stack(s) while s.stack_len != 0: - if not has_head(get_s0(s)): - get_s0(s).dep = 0 + #add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep) s.stack -= 1 s.stack_len -= 1 if not at_eol(s): @@ -228,8 +257,9 @@ cdef class TransitionSystem: elif t.move == BREAK_SHIFT: push_stack(s) get_s0(s).dep = 0 - s.stack -= s.stack_len - s.stack_len = 0 + while s.stack_len != 0: + s.stack -= 1 + s.stack_len -= 1 if not at_eol(s): push_stack(s) else: @@ -289,10 +319,11 @@ cdef class TransitionSystem: elif gold_heads[s.i] == s.stack[0]: target_label = gold_labels[s.i] if guess.move == RIGHT or guess.move == BREAK_RIGHT: - guess.cost += guess.label != target_label + if unl_costs[guess.move] != 0: + guess.cost += guess.label != target_label for i in range(self.n_moves): t = self._moves[i] - if (t.move == RIGHT or t.move == BREAK_RIGHT) and t.label == target_label: + if t.label == target_label and unl_costs[t.move] == 0: return t cdef int best = -1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index c93bc4cf2..61324f69c 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -41,11 +41,12 @@ def set_debug(val): cdef unicode print_state(State* s, list words): words = list(words) + ['EOL'] - top = words[s.stack[0]] - second = words[s.stack[-1]] + top = words[s.stack[0]] + '_%d' % s.sent[s.stack[0]].head + second = words[s.stack[-1]] + '_%d' % s.sent[s.stack[-1]].head + third = words[s.stack[-2]] + '_%d' % s.sent[s.stack[-2]].head n0 = words[s.i] n1 = words[s.i + 1] - return ' '.join((second, top, '|', n0, n1)) + return ' '.join((str(s.stack_len), third, second, top, '|', n0, n1)) def get_templates(name): @@ -86,7 +87,8 @@ cdef class GreedyParser: tokens.is_parsed = True return 0 - def train_sent(self, Tokens tokens, list gold_heads, list gold_labels): + def train_sent(self, Tokens tokens, list gold_heads, list gold_labels, + force_gold=False): cdef: const Feature* feats const weight_t* scores @@ -104,15 +106,30 @@ cdef class GreedyParser: labels_array[i] = self.moves.label_ids[gold_labels[i]] py_words = [t.orth_ for t in tokens] + py_moves = ['S', 'D', 'L', 'R', 'BS', 'BR'] + history = [] + #print py_words cdef State* state = init_state(mem, tokens.data, tokens.length) while not is_final(state): fill_context(context, state) scores = self.model.score(context) guess = self.moves.best_valid(scores, state) best = self.moves.best_gold(&guess, scores, state, heads_array, labels_array) + history.append((py_moves[best.move], print_state(state, py_words))) self.model.update(context, guess.clas, best.clas, guess.cost) - self.moves.transition(state, &guess) + if force_gold: + self.moves.transition(state, &best) + else: + self.moves.transition(state, &guess) cdef int n_corr = 0 for i in range(tokens.length): n_corr += (i + state.sent[i].head) == gold_heads[i] + if force_gold and n_corr != tokens.length: + print py_words + print gold_heads + for move, state_str in history: + print move, state_str + for i in range(tokens.length): + print py_words[i], py_words[i + state.sent[i].head], py_words[gold_heads[i]] + raise Exception return n_corr From 320b045daaf4189418df10f718030db1e7ac4f41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 29 Jan 2015 03:41:58 +1100 Subject: [PATCH 06/17] * Oracle now consistent over gold standard derivation --- spacy/syntax/arc_eager.pyx | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git 
a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 5c5ab5356..8130e10b0 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -58,9 +58,12 @@ cdef inline bint _can_break_shift(const State* s) nogil: # P. 757 # In UPP, if Shift(F) or RightArc(F) fail to result in a single parsing # tree, they cannot be performed as well. + seen_headless = False for i in range(s.stack_len): - if s.sent[s.stack[i]].head == 0: + if seen_headless: return False + else: + seen_headless = True return True @@ -76,7 +79,7 @@ cdef inline bint _can_break_right(const State* s) nogil: # tree, they cannot be performed as well. seen_headless = False for i in range(s.stack_len): - if s.sent[s.stack[i]].head == 0: + if s.sent[s.stack[-i]].head == 0: if seen_headless: return False else: @@ -123,6 +126,7 @@ cdef int _left_cost(const State* s, const int* gold) except -1: cost += children_in_buffer(s, s.stack[0], gold) if NON_MONOTONIC and s.stack_len >= 2: cost += gold[s.stack[0]] == s.stack[-1] + cost += gold[s.stack[0]] == s.stack[0] return cost @@ -140,18 +144,8 @@ cdef int _break_shift_cost(const State* s, const int* gold) except -1: # # n0_cost: cdef int cost = 0 - # number of head/child deps between n0 and N1...Nn - cost += children_in_buffer(s, s.i, gold) - cost += head_in_buffer(s, s.i, gold) - # Don't count self-deps - if gold[s.i] == s.i: - cost -= 2 - # number of child deps from N0 into stack - cost += children_in_stack(s, s.i, gold) - # number of head deps to N0 from stack - cost += head_in_stack(s, s.i, gold) - # Number of deps between S0...Sn and N1...Nn - for i in range(s.i+1, s.sent_len): + # Number of deps between S0...Sn and N0...Nn + for i in range(s.i, s.sent_len): cost += children_in_stack(s, i, gold) cost += head_in_stack(s, i, gold) return cost @@ -255,8 +249,6 @@ cdef class TransitionSystem: if not at_eol(s): push_stack(s) elif t.move == BREAK_SHIFT: - push_stack(s) - get_s0(s).dep = 0 while s.stack_len != 0: s.stack -= 1 s.stack_len -= 1 From b4348ce1c38eb9ccee3988372f39b8408f209a9b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 29 Jan 2015 04:21:13 +1100 Subject: [PATCH 07/17] * Messily use unsegmented sentences to train the parser --- bin/parser/train.py | 63 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 0b214a20c..67f01ee95 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -26,6 +26,7 @@ def read_tokenized_gold(file_): """Read a standard CoNLL/MALT-style format""" sents = [] for sent_str in file_.read().strip().split('\n\n'): + ids = [] words = [] heads = [] labels = [] @@ -35,10 +36,11 @@ def read_tokenized_gold(file_): words.append(word) if head_idx == -1: head_idx = i + ids.append(id_) heads.append(head_idx) labels.append(label) tags.append(pos_string) - sents.append((words, heads, labels, tags)) + sents.append((ids_, words, heads, labels, tags)) return sents @@ -49,31 +51,62 @@ def read_docparse_gold(file_): heads = [] labels = [] tags = [] + ids = [] lines = sent_str.strip().split('\n') raw_text = lines[0] tok_text = lines[1] for i, line in enumerate(lines[2:]): - word, pos_string, head_idx, label = _parse_line(line) + id_, word, pos_string, head_idx, label = _parse_line(line) + if label == 'root': + label = 'ROOT' + if pos_string == "``": + word = "``" + elif pos_string == "''": + word = "''" words.append(word) - if head_idx == -1: - head_idx = i + if head_idx < 0: + head_idx = id_ + ids.append(id_) heads.append(head_idx) 
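# Editorial note (not in the original patch): head_idx here is the head
# token's id as given in the file (column 7 in _parse_line, no longer
# decremented), not a position within the sentence, and a root token points
# to its own id. _map_indices_to_tokens() below converts these ids back into
# token positions once the full id list is known.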
labels.append(label) tags.append(pos_string) - words = tok_text.replace('', ' ').replace('', ' ').split(' ') + heads = _map_indices_to_tokens(ids, heads) + words = tok_text.replace('', ' ').replace('', ' ').split() + #print words + #print heads sents.append((words, heads, labels, tags)) + #sent_strings = tok_text.split('') + #for sent in sent_strings: + # sent_words = sent.replace('', ' ').split(' ') + # sent_heads = [] + # sent_labels = [] + # sent_tags = [] + # sent_ids = [] + # while len(sent_heads) < len(sent_words): + # sent_heads.append(heads.pop(0)) + # sent_labels.append(labels.pop(0)) + # sent_tags.append(tags.pop(0)) + # sent_ids.append(ids.pop(0)) + # sent_heads = _map_indices_to_tokens(sent_ids, sent_heads) + # sents.append((sent_words, sent_heads, sent_labels, sent_tags)) return sents +def _map_indices_to_tokens(ids, heads): + return [ids.index(head) for head in heads] + + + def _parse_line(line): pieces = line.split() if len(pieces) == 4: - return pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] + return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] else: + id_ = int(pieces[0]) word = pieces[1] pos = pieces[3] - head_idx = int(pieces[6]) - 1 + head_idx = int(pieces[6]) label = pieces[7] - return word, pos, head_idx, label + return id_, word, pos, head_idx, label def get_labels(sents): left_labels = set() @@ -113,7 +146,11 @@ def train(Language, sents, model_dir, n_iter=15, feat_set=u'basic', seed=0): tags = [nlp.tagger.tag_names.index(tag) for tag in tags] tokens = nlp.tokenizer.tokens_from_list(words) nlp.tagger(tokens) - heads_corr += nlp.parser.train_sent(tokens, heads, labels) + try: + heads_corr += nlp.parser.train_sent(tokens, heads, labels, force_gold=False) + except: + print heads + raise pos_corr += nlp.tagger.train(tokens, tags) n_tokens += len(tokens) acc = float(heads_corr) / n_tokens @@ -122,7 +159,6 @@ def train(Language, sents, model_dir, n_iter=15, feat_set=u'basic', seed=0): random.shuffle(sents) nlp.parser.model.end_training() nlp.tagger.model.end_training() - #nlp.parser.model.dump(path.join(dep_model_dir, 'model'), freq_thresh=0) return acc @@ -131,13 +167,13 @@ def evaluate(Language, dev_loc, model_dir): n_corr = 0 total = 0 with codecs.open(dev_loc, 'r', 'utf8') as file_: - sents = read_tokenized_gold(file_) + sents = read_docparse_gold(file_) for words, heads, labels, tags in sents: tokens = nlp.tokenizer.tokens_from_list(words) nlp.tagger(tokens) nlp.parser(tokens) for i, token in enumerate(tokens): - #print i, token.string, i + token.head, heads[i], labels[i] + #print i, token.orth_, token.head.orth_, tokens[heads[i]].orth_, labels[i], token.head.i == heads[i] if labels[i] == 'P' or labels[i] == 'punct': continue n_corr += token.head.i == heads[i] @@ -150,7 +186,8 @@ PROFILE = False def main(train_loc, dev_loc, model_dir): with codecs.open(train_loc, 'r', 'utf8') as file_: - train_sents = read_tokenized_gold(file_) + train_sents = read_docparse_gold(file_) + train_sents = train_sents if PROFILE: import cProfile import pstats From d05c5bf1410853ee1e43495356218ca97f14e783 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 29 Jan 2015 05:19:27 +1100 Subject: [PATCH 08/17] * Remove comment --- spacy/syntax/arc_eager.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 8130e10b0..acbc4ac87 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -243,7 +243,6 @@ cdef class TransitionSystem: add_dep(s, s.stack[0], s.i, t.label) push_stack(s) while 
s.stack_len != 0: - #add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep) s.stack -= 1 s.stack_len -= 1 if not at_eol(s): From ebf7d2fab1246250d0c284eeaa7d261161145f22 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 29 Jan 2015 06:22:03 +1100 Subject: [PATCH 09/17] * Use non-joint sbd, for more simplicity and fewer classes --- spacy/syntax/arc_eager.pyx | 94 ++++++-------------------------------- 1 file changed, 14 insertions(+), 80 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index acbc4ac87..80cf9deaf 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -18,8 +18,7 @@ cdef enum: REDUCE LEFT RIGHT - BREAK_SHIFT - BREAK_RIGHT + BREAK N_MOVES # Break transition from here @@ -48,35 +47,14 @@ cdef inline bint _can_reduce(const State* s) nogil: return s.stack_len >= 2 and has_head(get_s0(s)) -cdef inline bint _can_break_shift(const State* s) nogil: +cdef inline bint _can_break(const State* s) nogil: cdef int i if not USE_BREAK: return False elif at_eol(s): return False else: - # P. 757 - # In UPP, if Shift(F) or RightArc(F) fail to result in a single parsing - # tree, they cannot be performed as well. - seen_headless = False - for i in range(s.stack_len): - if seen_headless: - return False - else: - seen_headless = True - return True - - -cdef inline bint _can_break_right(const State* s) nogil: - cdef int i - if not USE_BREAK: - return False - elif not _can_right(s): - return False - else: - # P. 757 - # In UPP, if Shift(F) or RightArc(F) fail to result in a single parsing - # tree, they cannot be performed as well. + # If stack is disconnected, cannot break seen_headless = False for i in range(s.stack_len): if s.sent[s.stack[-i]].head == 0: @@ -95,7 +73,7 @@ cdef int _shift_cost(const State* s, const int* gold) except -1: if NON_MONOTONIC: cost += gold[s.stack[0]] == s.i # If we can break, and there's no cost to doing so, we should - if _can_break_shift(s) and _break_shift_cost(s, gold) == 0: + if _can_break(s) and _break_cost(s, gold) == 0: cost += 1 return cost @@ -103,9 +81,6 @@ cdef int _shift_cost(const State* s, const int* gold) except -1: cdef int _right_cost(const State* s, const int* gold) except -1: assert s.stack_len >= 1 cost = 0 - # If we can break, and there's no cost to doing so, we should - if _can_break_right(s) and _break_right_cost(s, gold) == 0: - cost += 1 if gold[s.i] == s.stack[0]: return cost cost += head_in_buffer(s, s.i, gold) @@ -138,11 +113,8 @@ cdef int _reduce_cost(const State* s, const int* gold) except -1: return cost -cdef int _break_shift_cost(const State* s, const int* gold) except -1: - # When we break, we Reduce all of the words on the stack. We also remove - # the first word from the buffer. - # - # n0_cost: +cdef int _break_cost(const State* s, const int* gold) except -1: + # When we break, we Reduce all of the words on the stack. cdef int cost = 0 # Number of deps between S0...Sn and N0...Nn for i in range(s.i, s.sent_len): @@ -151,29 +123,6 @@ cdef int _break_shift_cost(const State* s, const int* gold) except -1: return cost -cdef int _break_right_cost(const State* s, const int* gold) except -1: - cdef int cost = 0 - assert s.stack_len >= 1 - cdef int i - # When we break, we Reduce all of the words on the stack. We also remove - # the first word from the buffer. 
- # - # n0_cost: - # number of head/child deps between n0 and N0...Nn - cost += children_in_buffer(s, s.i, gold) - cost += head_in_buffer(s, s.i, gold) - # number of child deps from N0 into stack - cost += children_in_stack(s, s.i, gold) - # number of head deps to N0 from S1..Sn - for i in range(1, s.stack_len): - cost += s.stack[-i] == gold[s.i] - # Number of deps between S0...Sn and N1...Nn - for i in range(s.i+1, s.sent_len): - cost += children_in_stack(s, i, gold) - cost += head_in_stack(s, i, gold) - return cost - - cdef class TransitionSystem: def __init__(self, list left_labels, list right_labels): self.mem = Pool() @@ -183,7 +132,7 @@ cdef class TransitionSystem: right_labels.pop(right_labels.index('ROOT')) if 'ROOT' in left_labels: left_labels.pop(left_labels.index('ROOT')) - self.n_moves = 3 + len(left_labels) + len(right_labels) + len(right_labels) + self.n_moves = 3 + len(left_labels) + len(right_labels) moves = self.mem.alloc(self.n_moves, sizeof(Transition)) cdef int i = 0 moves[i].move = SHIFT @@ -210,17 +159,10 @@ cdef class TransitionSystem: moves[i].label = label_id moves[i].clas = i i += 1 - moves[i].move = BREAK_SHIFT + moves[i].move = BREAK moves[i].label = 0 moves[i].clas = i i += 1 - for label_str in right_labels: - label_str = unicode(label_str) - label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) - moves[i].move = BREAK_RIGHT - moves[i].label = label_id - moves[i].clas = i - i += 1 self._moves = moves cdef int transition(self, State *s, const Transition* t) except -1: @@ -239,16 +181,10 @@ cdef class TransitionSystem: # TODO: Huh? Is this some weirdness from the non-monotonic? add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep) pop_stack(s) - elif t.move == BREAK_RIGHT: - add_dep(s, s.stack[0], s.i, t.label) - push_stack(s) - while s.stack_len != 0: - s.stack -= 1 - s.stack_len -= 1 - if not at_eol(s): - push_stack(s) - elif t.move == BREAK_SHIFT: + elif t.move == BREAK: while s.stack_len != 0: + if get_s0(s).head == 0: + get_s0(s).dep = 0 s.stack -= 1 s.stack_len -= 1 if not at_eol(s): @@ -262,8 +198,7 @@ cdef class TransitionSystem: valid[LEFT] = _can_left(s) valid[RIGHT] = _can_right(s) valid[REDUCE] = _can_reduce(s) - valid[BREAK_SHIFT] = _can_break_shift(s) - valid[BREAK_RIGHT] = _can_break_right(s) + valid[BREAK] = _can_break(s) cdef int best = -1 cdef weight_t score = 0 @@ -292,8 +227,7 @@ cdef class TransitionSystem: unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1 unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1 unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1 - unl_costs[BREAK_SHIFT] = _break_shift_cost(s, gold_heads) if _can_break_shift(s) else -1 - unl_costs[BREAK_RIGHT] = _break_right_cost(s, gold_heads) if _can_break_right(s) else -1 + unl_costs[BREAK] = _break_cost(s, gold_heads) if _can_break(s) else -1 guess.cost = unl_costs[guess.move] cdef Transition t @@ -309,7 +243,7 @@ cdef class TransitionSystem: return t elif gold_heads[s.i] == s.stack[0]: target_label = gold_labels[s.i] - if guess.move == RIGHT or guess.move == BREAK_RIGHT: + if guess.move == RIGHT: if unl_costs[guess.move] != 0: guess.cost += guess.label != target_label for i in range(self.n_moves): From 11ed65b93c85ac348923412d83641db3fff9ddad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Jan 2015 10:31:03 +1100 Subject: [PATCH 10/17] * Work on alignment, for evaluation with non-gold preprocessing --- bin/parser/train.py | 152 +++++++++++++++++++++++++++----------------- 1 file changed, 94 
insertions(+), 58 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 67f01ee95..f41addb7f 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -45,7 +45,7 @@ def read_tokenized_gold(file_): def read_docparse_gold(file_): - sents = [] + paragraphs = [] for sent_str in file_.read().strip().split('\n\n'): words = [] heads = [] @@ -59,10 +59,6 @@ def read_docparse_gold(file_): id_, word, pos_string, head_idx, label = _parse_line(line) if label == 'root': label = 'ROOT' - if pos_string == "``": - word = "``" - elif pos_string == "''": - word = "''" words.append(word) if head_idx < 0: head_idx = id_ @@ -70,30 +66,20 @@ def read_docparse_gold(file_): heads.append(head_idx) labels.append(label) tags.append(pos_string) - heads = _map_indices_to_tokens(ids, heads) - words = tok_text.replace('', ' ').replace('', ' ').split() - #print words - #print heads - sents.append((words, heads, labels, tags)) - #sent_strings = tok_text.split('') - #for sent in sent_strings: - # sent_words = sent.replace('', ' ').split(' ') - # sent_heads = [] - # sent_labels = [] - # sent_tags = [] - # sent_ids = [] - # while len(sent_heads) < len(sent_words): - # sent_heads.append(heads.pop(0)) - # sent_labels.append(labels.pop(0)) - # sent_tags.append(tags.pop(0)) - # sent_ids.append(ids.pop(0)) - # sent_heads = _map_indices_to_tokens(sent_ids, sent_heads) - # sents.append((sent_words, sent_heads, sent_labels, sent_tags)) - return sents + tokenized = [sent_str.replace('', ' ').split(' ') + for sent_str in tok_text.split('')] + paragraphs.append((raw_text, tokenized, ids, words, tags, heads, labels)) + return paragraphs + def _map_indices_to_tokens(ids, heads): - return [ids.index(head) for head in heads] - + mapped = [] + for head in heads: + if head not in ids: + mapped.append(None) + else: + mapped.append(ids.index(head)) + return mapped def _parse_line(line): @@ -108,10 +94,71 @@ def _parse_line(line): label = pieces[7] return id_, word, pos, head_idx, label + + +def _align_annotations_to_non_gold_tokens(tokens, words, annot): + tags = [] + heads = [] + labels = [] + loss = 0 + print [t.orth_ for t in tokens] + print words + for token in tokens: + print token.orth_, words[0] + while annot and token.idx > annot[0][0]: + annot.pop(0) + words.pop(0) + loss += 1 + if not annot: + tags.append(None) + heads.append(None) + labels.append(None) + continue + id_, tag, head, label = annot[0] + if token.idx == id_: + tags.append(tag) + heads.append(head) + labels.append(label) + annot.pop(0) + words.pop(0) + elif token.idx < id_: + tags.append(None) + heads.append(None) + labels.append(None) + else: + raise StandardError + return loss, tags, heads, labels + + +def iter_data(paragraphs, tokenizer, gold_preproc=False): + for raw, tokenized, ids, words, tags, heads, labels in paragraphs: + if not gold_preproc: + tokens = tokenizer(raw) + loss, tags, heads, labels = _align_annotations_to_non_gold_tokens( + tokens, words, zip(ids, tags, heads, labels)) + ids = [t.idx for t in tokens] + heads = _map_indices_to_tokens(ids, heads) + yield tokens, tags, heads, labels + else: + assert len(words) == len(heads) + for words in tokenized: + sent_ids = ids[:len(words)] + sent_tags = tags[:len(words)] + sent_heads = heads[:len(words)] + sent_labels = labels[:len(words)] + sent_heads = _map_indices_to_tokens(sent_ids, sent_heads) + tokens = tokenizer.tokens_from_list(words) + yield tokens, sent_tags, sent_heads, sent_labels + ids = ids[len(words):] + tags = tags[len(words):] + heads = heads[len(words):] + labels = 
labels[len(words):] + + def get_labels(sents): left_labels = set() right_labels = set() - for _, heads, labels, _ in sents: + for raw, tokenized, ids, words, tags, heads, labels in sents: for child, (head, label) in enumerate(zip(heads, labels)): if head > child: left_labels.add(label) @@ -120,7 +167,8 @@ def get_labels(sents): return list(sorted(left_labels)), list(sorted(right_labels)) -def train(Language, sents, model_dir, n_iter=15, feat_set=u'basic', seed=0): +def train(Language, paragraphs, model_dir, n_iter=15, feat_set=u'basic', seed=0, + gold_preproc=True): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') if path.exists(dep_model_dir): @@ -132,7 +180,7 @@ def train(Language, sents, model_dir, n_iter=15, feat_set=u'basic', seed=0): setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - left_labels, right_labels = get_labels(sents) + left_labels, right_labels = get_labels(paragraphs) Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, left_labels=left_labels, right_labels=right_labels) @@ -142,62 +190,50 @@ def train(Language, sents, model_dir, n_iter=15, feat_set=u'basic', seed=0): heads_corr = 0 pos_corr = 0 n_tokens = 0 - for words, heads, labels, tags in sents: - tags = [nlp.tagger.tag_names.index(tag) for tag in tags] - tokens = nlp.tokenizer.tokens_from_list(words) + for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer, + gold_preproc=gold_preproc): + tags = [nlp.tagger.tag_names.index(tag) for tag in tag_strs] nlp.tagger(tokens) - try: - heads_corr += nlp.parser.train_sent(tokens, heads, labels, force_gold=False) - except: - print heads - raise + heads_corr += nlp.parser.train_sent(tokens, heads, labels, force_gold=False) pos_corr += nlp.tagger.train(tokens, tags) n_tokens += len(tokens) acc = float(heads_corr) / n_tokens pos_acc = float(pos_corr) / n_tokens print '%d: ' % itn, '%.3f' % acc, '%.3f' % pos_acc - random.shuffle(sents) + random.shuffle(paragraphs) nlp.parser.model.end_training() nlp.tagger.model.end_training() return acc -def evaluate(Language, dev_loc, model_dir): +def evaluate(Language, dev_loc, model_dir, gold_preproc=False): nlp = Language() n_corr = 0 total = 0 + skipped = 0 with codecs.open(dev_loc, 'r', 'utf8') as file_: - sents = read_docparse_gold(file_) - for words, heads, labels, tags in sents: - tokens = nlp.tokenizer.tokens_from_list(words) + paragraphs = read_docparse_gold(file_) + for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer, + gold_preproc=gold_preproc): + assert len(tokens) == len(labels) nlp.tagger(tokens) nlp.parser(tokens) for i, token in enumerate(tokens): - #print i, token.orth_, token.head.orth_, tokens[heads[i]].orth_, labels[i], token.head.i == heads[i] + if heads[i] is None: + skipped += 1 if labels[i] == 'P' or labels[i] == 'punct': continue n_corr += token.head.i == heads[i] total += 1 + print skipped return float(n_corr) / total -PROFILE = False - - def main(train_loc, dev_loc, model_dir): with codecs.open(train_loc, 'r', 'utf8') as file_: train_sents = read_docparse_gold(file_) - train_sents = train_sents - if PROFILE: - import cProfile - import pstats - cmd = "train(EN, train_sents, tag_names, model_dir, n_iter=2)" - cProfile.runctx(cmd, globals(), locals(), "Profile.prof") - s = pstats.Stats("Profile.prof") - s.strip_dirs().sort_stats("time").print_stats() - else: - train(English, train_sents, model_dir) - print evaluate(English, dev_loc, model_dir) + #train(English, train_sents, model_dir, 
gold_preproc=False) + print evaluate(English, dev_loc, model_dir, gold_preproc=False) if __name__ == '__main__': From 5458f220f8c2b4df4cad55f635ceac964db1b130 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Jan 2015 10:31:25 +1100 Subject: [PATCH 11/17] * Fix quickstart instructions --- docs/source/quickstart.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 1f0f034f4..51789fe55 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -34,8 +34,9 @@ there. I'll have a better solution shortly, probably using Github Releases. $ wget https://devpi.net/root/pypi/+f/4e8/d81919a7876fe/spacy-0.33.tar.gz $ tar -xzf spacy-0.33.tar.gz $ cp -r spacy-0.33/spacy/en/data spacy/en/data + $ fab make $ python -m spacy.en.download - $ fab make test + $ fab test Python packaging is awkward at the best of times, and it's particularly tricky with C extensions, built via Cython, requiring large data files. So, please From b38093237e24085a788214ac8b7093b64eb00bf9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Jan 2015 11:15:54 +1100 Subject: [PATCH 12/17] * More debug prints --- bin/parser/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/parser/train.py b/bin/parser/train.py index f41addb7f..d901a914f 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -105,7 +105,9 @@ def _align_annotations_to_non_gold_tokens(tokens, words, annot): print words for token in tokens: print token.orth_, words[0] + print token.idx, annot[0][0] while annot and token.idx > annot[0][0]: + print 'pop', token.idx, annot[0][0] annot.pop(0) words.pop(0) loss += 1 From 0a7fcebdf7d05ec961bd940c6217988e34b1fced Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Jan 2015 12:33:38 +1100 Subject: [PATCH 13/17] * Fix Issue #12: Incorrect token.idx calculations for some punctuation, in the presence of token cache --- spacy/tokenizer.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f540eeb88..0f96c058e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -95,7 +95,6 @@ cdef class Tokenizer: return tokens cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: - #cached = self._specials.get(key) cached = <_Cached*>self._cache.get(key) if cached == NULL: return False @@ -176,7 +175,12 @@ cdef class Tokenizer: if string.n != 0: cache_hit = self._try_cache(idx, string.key, tokens) if cache_hit: - idx = tokens.data[tokens.length - 1].idx + 1 + # Get last idx + idx = tokens.data[tokens.length - 1].idx + # Increment by last length + idx += tokens.data[tokens.length - 1].lex.length + # Add 1 for space + idx += 1 else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: From d0e08a5b57a3253e7b3d184df9afe81b21d6610f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Jan 2015 12:35:13 +1100 Subject: [PATCH 14/17] * Upd index tests --- tests/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 1f195e5e7..58bb1afaf 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -8,7 +8,7 @@ from spacy.en import English @pytest.fixture def EN(): - return English() + return English().tokenizer def test_single_word(EN): tokens = EN(u'hello') From 4ff180db7404e703c49c06c4f155d637985953b0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Jan 2015 12:49:33 
+1100 Subject: [PATCH 15/17] * Fix off-by-one error in commit 0a7fceb --- spacy/tokenizer.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0f96c058e..1f7228c9b 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -179,8 +179,6 @@ cdef class Tokenizer: idx = tokens.data[tokens.length - 1].idx # Increment by last length idx += tokens.data[tokens.length - 1].lex.length - # Add 1 for space - idx += 1 else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: From 67d6e53a6959ec0e3c7c30fff95462c8bba54102 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Jan 2015 14:08:56 +1100 Subject: [PATCH 16/17] * Ensure parser and tagger function correctly when training from missing values, indicated by -1 --- spacy/en/pos.pyx | 10 +++++++--- spacy/syntax/parser.pyx | 9 +++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 1e19b9b82..d8d1685b2 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -255,19 +255,23 @@ cdef class EnPosTagger: tokens._tag_strings = self.tag_names tokens.is_tagged = True - def train(self, Tokens tokens, object golds): + def train(self, Tokens tokens, object gold_tag_strs): cdef int i + cdef int loss cdef atom_t[N_CONTEXT_FIELDS] context cdef const weight_t* scores + golds = [self.tag_names.index(g) if g is not None else -1 + for g in gold_tag_strs] correct = 0 for i in range(tokens.length): fill_context(context, i, tokens.data) scores = self.model.score(context) guess = arg_max(scores, self.model.n_classes) - self.model.update(context, guess, golds[i], guess != golds[i]) + loss = guess != golds[i] if golds[i] != -1 else 0 + self.model.update(context, guess, golds[i], loss) tokens.data[i].tag = guess self.set_morph(i, tokens.data) - correct += guess == golds[i] + correct += loss == 0 return correct cdef int set_morph(self, const int i, TokenC* tokens) except -1: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 61324f69c..4144e93cd 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -102,8 +102,12 @@ cdef class GreedyParser: cdef int* labels_array = mem.alloc(tokens.length, sizeof(int)) cdef int i for i in range(tokens.length): - heads_array[i] = gold_heads[i] - labels_array[i] = self.moves.label_ids[gold_labels[i]] + if gold_heads[i] is None: + heads_array[i] = -1 + labels_array[i] = -1 + else: + heads_array[i] = gold_heads[i] + labels_array[i] = self.moves.label_ids[gold_labels[i]] py_words = [t.orth_ for t in tokens] py_moves = ['S', 'D', 'L', 'R', 'BS', 'BR'] @@ -123,6 +127,7 @@ cdef class GreedyParser: self.moves.transition(state, &guess) cdef int n_corr = 0 for i in range(tokens.length): + if gold_heads[i] != -1: n_corr += (i + state.sent[i].head) == gold_heads[i] if force_gold and n_corr != tokens.length: print py_words From ca7577d8a9a489cd086b37478c5921940c414814 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Jan 2015 16:36:24 +1100 Subject: [PATCH 17/17] * Allow parsers and taggers to be trained on text without gold pre-processing. 
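[Editorial note, not part of the original patch] The idea behind this final patch is worth spelling out: gold tags, heads and labels are keyed by character offset, and any token the tokenizer produces that has no gold counterpart is marked as missing (None), so the tagger and parser can give those positions zero cost (the -1 handling added to pos.pyx and parser.pyx in the previous patch). The sketch below is a minimal, self-contained Python illustration of that alignment step; the function and variable names are illustrative, not the actual spaCy API, which lives in bin/parser/train.py as _align_annotations_to_non_gold_tokens().

    def align_to_tokens(token_offsets, gold):
        """Align gold annotations to a predicted tokenization by character offset.

        token_offsets: start offsets of the predicted tokens, in order.
        gold: list of (offset, tag, head_offset, label) tuples, sorted by offset.
        Returns (n_missed, tags, heads, labels); unaligned tokens get None.
        """
        gold = list(gold)
        n_missed = 0
        tags, heads, labels = [], [], []
        for idx in token_offsets:
            # Drop gold tokens whose offsets we have already passed: the two
            # tokenizations disagree here, so those annotations are lost.
            while gold and idx > gold[0][0]:
                gold.pop(0)
                n_missed += 1
            if gold and idx == gold[0][0]:
                _, tag, head, label = gold.pop(0)
                tags.append(tag)
                heads.append(head)
                labels.append(label)
            else:
                # No gold annotation starts exactly here; record it as missing
                # so training can skip it (head = -1 in the parser).
                tags.append(None)
                heads.append(None)
                labels.append(None)
        return n_missed, tags, heads, labels

    # Example: three predicted tokens at offsets 0, 4 and 8, with gold
    # annotations at the same offsets, align with no loss.
    print(align_to_tokens([0, 4, 8],
                          [(0, 'DT', 4, 'det'), (4, 'NN', 4, 'ROOT'), (8, '.', 4, 'punct')]))
    # (0, ['DT', 'NN', '.'], [4, 4, 4], ['det', 'ROOT', 'punct'])

The heads returned by this sketch are still character offsets; as in the patch, they would then be mapped to token positions with something like _map_indices_to_tokens(). Unaligned gold tokens are counted as loss so that evaluate() can report accuracy over (total + loss), penalising tokenization mistakes instead of silently dropping them.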
--- bin/parser/train.py | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index d901a914f..eb83edb63 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -9,6 +9,7 @@ import codecs import random import time import gzip +import nltk import plac import cProfile @@ -22,6 +23,10 @@ from spacy.syntax.parser import GreedyParser from spacy.syntax.util import Config +def is_punct_label(label): + return label == 'P' or label.lower() == 'punct' + + def read_tokenized_gold(file_): """Read a standard CoNLL/MALT-style format""" sents = [] @@ -96,21 +101,21 @@ def _parse_line(line): +loss = 0 def _align_annotations_to_non_gold_tokens(tokens, words, annot): + global loss tags = [] heads = [] labels = [] - loss = 0 - print [t.orth_ for t in tokens] - print words + orig_words = list(words) + missed = [] for token in tokens: - print token.orth_, words[0] - print token.idx, annot[0][0] while annot and token.idx > annot[0][0]: - print 'pop', token.idx, annot[0][0] - annot.pop(0) - words.pop(0) - loss += 1 + miss_id, miss_tag, miss_head, miss_label = annot.pop(0) + miss_w = words.pop(0) + if not is_punct_label(miss_label): + missed.append(miss_w) + loss += 1 if not annot: tags.append(None) heads.append(None) @@ -129,6 +134,11 @@ def _align_annotations_to_non_gold_tokens(tokens, words, annot): labels.append(None) else: raise StandardError + #if missed: + # print orig_words + # print missed + # for t in tokens: + # print t.idx, t.orth_ return loss, tags, heads, labels @@ -137,7 +147,8 @@ def iter_data(paragraphs, tokenizer, gold_preproc=False): if not gold_preproc: tokens = tokenizer(raw) loss, tags, heads, labels = _align_annotations_to_non_gold_tokens( - tokens, words, zip(ids, tags, heads, labels)) + tokens, list(words), + zip(ids, tags, heads, labels)) ids = [t.idx for t in tokens] heads = _map_indices_to_tokens(ids, heads) yield tokens, tags, heads, labels @@ -170,7 +181,7 @@ def get_labels(sents): def train(Language, paragraphs, model_dir, n_iter=15, feat_set=u'basic', seed=0, - gold_preproc=True): + gold_preproc=False): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') if path.exists(dep_model_dir): @@ -194,10 +205,9 @@ def train(Language, paragraphs, model_dir, n_iter=15, feat_set=u'basic', seed=0, n_tokens = 0 for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer, gold_preproc=gold_preproc): - tags = [nlp.tagger.tag_names.index(tag) for tag in tag_strs] nlp.tagger(tokens) heads_corr += nlp.parser.train_sent(tokens, heads, labels, force_gold=False) - pos_corr += nlp.tagger.train(tokens, tags) + pos_corr += nlp.tagger.train(tokens, tag_strs) n_tokens += len(tokens) acc = float(heads_corr) / n_tokens pos_acc = float(pos_corr) / n_tokens @@ -223,12 +233,13 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False): for i, token in enumerate(tokens): if heads[i] is None: skipped += 1 - if labels[i] == 'P' or labels[i] == 'punct': + continue + if is_punct_label(labels[i]): continue n_corr += token.head.i == heads[i] total += 1 - print skipped - return float(n_corr) / total + print loss, skipped, (loss+skipped + total) + return float(n_corr) / (total + loss) def main(train_loc, dev_loc, model_dir):