Tidy up syntax

parent 5167a0cce2
commit b4d226a3f1
@@ -2,7 +2,7 @@
 # cython: profile=True
 cimport numpy as np
 import numpy
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
+from cpython.ref cimport PyObject, Py_XDECREF
 from thinc.extra.search cimport Beam
 from thinc.extra.search import MaxViolation
 from thinc.typedefs cimport hash_t, class_t
@@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
 from .transition_system cimport TransitionSystem, Transition
 from .stateclass cimport StateClass
 from ..gold cimport GoldParse
-from ..tokens.doc cimport Doc


 # These are passed as callbacks to thinc.search.Beam
@@ -50,7 +49,7 @@ cdef class ParserBeam(object):
     cdef public object dones

     def __init__(self, TransitionSystem moves, states, golds,
-            int width, float density):
+                 int width, float density):
         self.moves = moves
         self.states = states
         self.golds = golds
@@ -59,7 +58,8 @@ cdef class ParserBeam(object):
         cdef StateClass state, st
         for state in states:
             beam = Beam(self.moves.n_moves, width, density)
-            beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
+            beam.initialize(self.moves.init_beam_state, state.c.length,
+                            state.c._sent)
             for i in range(beam.width):
                 st = <StateClass>beam.at(i)
                 st.c.offset = state.c.offset
@@ -74,7 +74,8 @@ cdef class ParserBeam(object):

     @property
     def is_done(self):
-        return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
+        return all(b.is_done or self.dones[i]
+                   for i, b in enumerate(self.beams))

     def __getitem__(self, i):
         return self.beams[i]
@@ -126,7 +127,8 @@ cdef class ParserBeam(object):
         for i in range(beam.size):
             state = <StateClass>beam.at(i)
             if not state.c.is_final():
-                self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
+                self.moves.set_costs(beam.is_valid[i], beam.costs[i],
+                                     state, gold)
                 if follow_gold:
                     for j in range(beam.nr_class):
                         if beam.costs[i][j] >= 1:
@@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
         c_ids += ids.shape[1]
     return ids

+
 nr_update = 0
+
+
 def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                 states, golds,
                 state2vec, vec2scores,
@@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
         if pbeam.is_done and gbeam.is_done:
             break
         # The beam maps let us find the right row in the flattened scores
-        # arrays for each state. States are identified by (example id, history).
-        # We keep a different beam map for each step (since we'll have a flat
-        # scores array for each step). The beam map will let us take the per-state
-        # losses, and compute the gradient for each (step, state, class).
+        # arrays for each state. States are identified by (example id,
+        # history). We keep a different beam map for each step (since we'll
+        # have a flat scores array for each step). The beam map will let us
+        # take the per-state losses, and compute the gradient for each (step,
+        # state, class).
         beam_maps.append({})
         # Gather all states from the two beams in a list. Some states may occur
         # in both beams. To figure out which beam each state belonged to,
         # we keep two lists of indices, p_indices and g_indices
-        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
+        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
+                                                  nr_update)
         if not states:
             break
         # Now that we have our flat list of states, feed them through the model
         token_ids = get_token_ids(states, nr_feature)
         vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
         if hist_feats:
-            hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
-            scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
+            hists = numpy.asarray([st.history[:hist_feats] for st in states],
+                                  dtype='i')
+            scores, bp_scores = vec2scores.begin_update((vectors, hists),
+                                                        drop=drop)
         else:
             scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)

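For reference, the bookkeeping described in the beam-map comment above amounts to something like the following minimal sketch (plain Python; `beams`, `eg_id` and `history` are illustrative names, not spaCy's API):

    beam_map = {}      # one map per step: (example id, action history) -> row
    flat_states = []   # states fed through the model at this step
    for eg_id, beam in enumerate(beams):
        for state in beam:
            key = (eg_id, tuple(state.history))
            if key not in beam_map:
                beam_map[key] = len(flat_states)
                flat_states.append(state)
    # The scores array for this step has one row per entry in flat_states,
    # so a per-state loss can later be routed back through beam_map.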
@@ -192,8 +201,10 @@

         # Unpack the flat scores into lists for the two beams. The indices arrays
         # tell us which example and state the scores-row refers to.
-        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
-        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')  for indices in g_indices]
+        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
+                    for indices in p_indices]
+        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
+                    for indices in g_indices]
         # Now advance the states in the beams. The gold beam is constrained
         # to follow only gold analyses.
         pbeam.advance(p_scores)
@@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):


 def get_gradient(nr_class, beam_maps, histories, losses):
-    """
-    The global model assigns a loss to each parse. The beam scores
+    """The global model assigns a loss to each parse. The beam scores
     are additive, so the same gradient is applied to each action
     in the history. This gives the gradient of a single *action*
     for a beam state -- so we have "the gradient of loss for taking
@@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
             if loss != 0.0 and not numpy.isnan(loss):
                 nr_step = max(nr_step, len(hist))
     for i in range(nr_step):
-        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
+        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
+                                 dtype='f'))
     assert len(histories) == len(losses)
     for eg_id, hists in enumerate(histories):
         for loss, hist in zip(losses[eg_id], hists):
@@ -287,5 +298,3 @@
                 grads[j][i, clas] += loss
                 key = key + tuple([clas])
     return grads
-
-
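A note on get_gradient: because the beam scores are additive, a parse's loss is applied to every action in its history. Sketched with the names from the hunk above (hedged; the key-initialisation line is not visible in this diff):

    key = (eg_id,)                     # identifies the start state
    for j, clas in enumerate(hist):    # hist = sequence of action classes
        i = beam_maps[j][key]          # row of this state at step j
        grads[j][i, clas] += loss      # same loss for each action taken
        key = key + (clas,)            # extend history -> next state's key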

@@ -1 +0,0 @@
-# test

@@ -4,24 +4,16 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
-import ctypes
-from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
+from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from collections import OrderedDict
 from thinc.extra.search cimport Beam
-import numpy

 from .stateclass cimport StateClass
-from ._state cimport StateC, is_space_token
+from ._state cimport StateC
 from .nonproj import is_nonproj_tree
-from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
-from ..gold cimport GoldParse
-from ..gold cimport GoldParseC
-from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
-from ..lexeme cimport Lexeme
+from ..gold cimport GoldParse, GoldParseC
 from ..structs cimport TokenC


@@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):

     @classmethod
     def get_actions(cls, **kwargs):
-        actions = kwargs.get('actions',
-                    OrderedDict((
-                        (SHIFT, ['']),
-                        (REDUCE, ['']),
-                        (RIGHT, []),
-                        (LEFT, []),
-                        (BREAK, ['ROOT'])
-                    )))
+        actions = kwargs.get('actions', OrderedDict((
+            (SHIFT, ['']),
+            (REDUCE, ['']),
+            (RIGHT, []),
+            (LEFT, []),
+            (BREAK, ['ROOT']))
+        ))
         seen_actions = set()
         for label in kwargs.get('left_labels', []):
             if label.upper() != 'ROOT':
@@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
             if gold.cand_to_gold[i] is None:
                 continue
             if state.safe_get(i).dep:
-                predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
+                predicted.add((i, state.H(i),
+                              self.strings[state.safe_get(i).dep]))
             else:
                 predicted.add((i, state.H(i), 'ROOT'))
             id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
@@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
         if not self.has_gold(gold):
             return None
         for i in range(gold.length):
-            if gold.heads[i] is None or gold.labels[i] is None: # Missing values
+            # Missing values
+            if gold.heads[i] is None or gold.labels[i] is None:
                 gold.c.heads[i] = i
                 gold.c.has_dep[i] = False
             else:
@@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
             # Check projectivity --- leading cause
             if is_nonproj_tree(gold.heads):
                 raise ValueError(
-                    "Could not find a gold-standard action to supervise the dependency "
-                    "parser.\n"
-                    "Likely cause: the tree is non-projective (i.e. it has crossing "
-                    "arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
-                    "The ArcEager transition system only supports projective trees.\n"
-                    "To learn non-projective representations, transform the data "
-                    "before training and after parsing. Either pass make_projective=True "
-                    "to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
+                    "Could not find a gold-standard action to supervise the "
+                    "dependency parser. Likely cause: the tree is "
+                    "non-projective (i.e. it has crossing arcs -- see "
+                    "spacy/syntax/nonproj.pyx for definitions). The ArcEager "
+                    "transition system only supports projective trees. To "
+                    "learn non-projective representations, transform the data "
+                    "before training and after parsing. Either pass "
+                    "make_projective=True to the GoldParse class, or use "
+                    "spacy.syntax.nonproj.preprocess_training_data.")
             else:
                 print(gold.orig_annot)
                 print(gold.words)
@@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
                 print(gold.labels)
                 print(gold.sent_starts)
                 raise ValueError(
-                    "Could not find a gold-standard action to supervise the dependency "
-                    "parser.\n"
-                    "The GoldParse was projective.\n"
-                    "The transition system has %d actions.\n"
-                    "State at failure:\n"
-                    "%s" % (self.n_moves, stcls.print_state(gold.words)))
+                    "Could not find a gold-standard action to supervise the "
+                    "dependency parser. The GoldParse was projective. The "
+                    "transition system has %d actions. State at failure: %s"
+                    % (self.n_moves, stcls.print_state(gold.words)))
         assert n_gold >= 1

     def get_beam_annot(self, Beam beam):
@@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
                     deps[j].setdefault(dep, 0.0)
                     deps[j][dep] += prob
         return heads, deps
-
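The two remedies named in the projectivity error above look roughly like this in use (a hedged sketch; argument names follow the GoldParse and nonproj code of this era):

    # Project the tree when constructing the gold standard:
    gold = GoldParse(doc, heads=heads, deps=labels, make_projective=True)
    # ...or transform the whole training set up front:
    gold_tuples = nonproj.preprocess_training_data(gold_tuples)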

@@ -4,17 +4,12 @@
 from thinc.typedefs cimport weight_t
 from thinc.extra.search cimport Beam
 from collections import OrderedDict
-import numpy
-from thinc.neural.ops import NumpyOps

 from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system cimport Transition
 from .transition_system cimport do_func_t
-from ..structs cimport TokenC, Entity
-from ..gold cimport GoldParseC
-from ..gold cimport GoldParse
-from ..attrs cimport ENT_TYPE, ENT_IOB
+from ..gold cimport GoldParseC, GoldParse


 cdef enum:
@@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):

     @classmethod
     def get_actions(cls, **kwargs):
-        actions = kwargs.get('actions',
-                    OrderedDict((
-                        (MISSING, ['']),
-                        (BEGIN, []),
-                        (IN, []),
-                        (LAST, []),
-                        (UNIT, []),
-                        (OUT, [''])
-                    )))
+        actions = kwargs.get('actions', OrderedDict((
+            (MISSING, ['']),
+            (BEGIN, []),
+            (IN, []),
+            (LAST, []),
+            (UNIT, []),
+            (OUT, [''])
+        )))
         seen_entities = set()
         for entity_type in kwargs.get('entity_types', []):
             if entity_type in seen_entities:
@@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):

     cdef Transition lookup_transition(self, object name) except *:
         cdef attr_t label
-        if name == '-' or name == None:
+        if name == '-' or name is None:
             return Transition(clas=0, move=MISSING, label=0, score=0)
         elif name == '!O':
             return Transition(clas=0, move=ISNT, label=0, score=0)
@@ -328,8 +322,8 @@ cdef class In:
             return False
         elif preset_ent_iob == 3:
             return False
-        # TODO: Is this quite right?
-        # I think it's supposed to be ensuring the gazetteer matches are maintained
+        # TODO: Is this quite right? I think it's supposed to be ensuring the
+        # gazetteer matches are maintained
         elif st.B_(1).ent_iob != preset_ent_iob:
             return False
         # Don't allow entities to extend across sentence boundaries
@@ -354,10 +348,12 @@
         if g_act == MISSING:
             return 0
         elif g_act == BEGIN:
-            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
+            # I, Gold B --> True
+            # (P of bad open entity sunk, R of this entity sunk)
             return 0
         elif g_act == IN:
-            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
+            # I, Gold I --> True
+            # (label forced by prev, if mismatch, P and R both sunk)
             return 0
         elif g_act == LAST:
             # I, Gold L --> True iff this entity sunk and next tag == O
@@ -505,11 +501,3 @@ cdef class Out:
             return 1
         else:
             return 1
-
-
-class OracleError(Exception):
-    pass
-
-
-class UnknownMove(Exception):
-    pass
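The moves in this transition system implement the BILUO tagging scheme. A worked example (illustrative tags only):

    words = ['Facebook', 'Inc.', 'is', 'in', 'Menlo', 'Park']
    biluo = ['B-ORG',    'L-ORG', 'O', 'O',  'B-GPE', 'L-GPE']
    # BEGIN/IN/LAST/UNIT/OUT correspond to the B-/I-/L-/U-/O- prefixes;
    # MISSING ('-' in lookup_transition above) marks unannotated tokens.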

@@ -5,71 +5,48 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function

-from collections import Counter, OrderedDict
+from collections import OrderedDict
 import ujson
 import json
-import contextlib
 import numpy
-
-from libc.math cimport exp
-cimport cython
 cimport cython.parallel
 import cytoolz
-import dill
-
 import numpy.random
 cimport numpy as np
-
-from libcpp.vector cimport vector
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
+from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
-from libc.stdint cimport uint32_t, uint64_t
-from libc.string cimport memset, memcpy
-from libc.stdlib cimport malloc, calloc, free
-from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
-from thinc.linear.avgtron cimport AveragedPerceptron
-from thinc.linalg cimport Vec, VecVec
-from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
-from thinc.extra.eg cimport Example
+from libc.math cimport exp
+from libcpp.vector cimport vector
+from libc.string cimport memset
+from libc.stdlib cimport calloc, free
+from cymem.cymem cimport Pool
+from thinc.typedefs cimport weight_t, class_t, hash_t
 from thinc.extra.search cimport Beam
-
-from cymem.cymem cimport Pool, Address
-from murmurhash.mrmr cimport hash64
-from preshed.maps cimport MapStruct
-from preshed.maps cimport map_get
-
-from thinc.api import layerize, chain, clone, with_flatten
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
+from thinc.api import chain, clone
+from thinc.v2v import Model, Maxout, Affine
 from thinc.misc import LayerNorm
-
-from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module

-from .. import util
-from ..util import get_async, get_cuda_stream
-from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
-from .._ml import Tok2Vec, doc2feats, rebatch
-from .._ml import Residual, flatten
+from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
 from .._ml import link_vectors_to_models
 from ..compat import json_dumps, copy_array
-
+from ..tokens.doc cimport Doc
+from ..gold cimport GoldParse
+from .. import util
 from .stateclass cimport StateClass
 from ._state cimport StateC
-from . import nonproj
-from .transition_system import OracleError
-from .transition_system cimport TransitionSystem, Transition
-from ..structs cimport TokenC
-from ..tokens.doc cimport Doc
-from ..strings cimport StringStore
-from ..gold cimport GoldParse
-from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
-from . import _beam_utils
+from .transition_system cimport Transition
+from . import _beam_utils, nonproj


 def get_templates(*args, **kwargs):
     return []

+
 DEBUG = False
+
+
 def set_debug(val):
     global DEBUG
     DEBUG = val
@@ -100,7 +77,8 @@ cdef class precompute_hiddens:
     cdef object _cuda_stream
     cdef object _bp_hiddens

-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 drop=0.):
         gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
         cdef np.ndarray cached
         if not isinstance(gpu_cached, numpy.ndarray):
@@ -120,8 +98,7 @@
         self._bp_hiddens = bp_features

     cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized \
-        and self._cuda_stream is not None:
+        if not self._is_synchronized and self._cuda_stream is not None:
             self._cuda_stream.synchronize()
             self._is_synchronized = True
         return <float*>self._cached.data
@@ -130,7 +107,8 @@
         return self.begin_update(X)[0]

     def begin_update(self, token_ids, drop=0.):
-        cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO*self.nP), dtype='f')
         # This is tricky, but (assuming GPU available);
         # - Input to forward on CPU
         # - Output from forward on CPU
@@ -141,8 +119,8 @@
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
         sum_state_features(<float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+                           feat_weights, &ids[0, 0],
+                           token_ids.shape[0], self.nF, self.nO*self.nP)
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

         def backward(d_state_vector, sgd=None):
@@ -161,10 +139,11 @@
         state_vector = state_vector.reshape(
             (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
         best, which = self.ops.maxout(state_vector)
+
         def backprop(d_best, sgd=None):
             return self.ops.backprop_maxout(d_best, which, self.nP)
-        return best, backprop

+        return best, backprop


 cdef void sum_state_features(float* output,
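What sum_state_features computes, as a hedged numpy sketch (assuming the cached table has shape (n_tokens, nF, nO*nP), one precomputed block per token and feature slot, and ids holds per-state token indices with negatives for padding):

    import numpy

    def sum_state_features_ref(cached, ids):
        n_states, nF = ids.shape
        out = numpy.zeros((n_states, cached.shape[2]), dtype='f')
        for s in range(n_states):
            for f in range(nF):
                if ids[s, f] >= 0:
                    out[s] += cached[ids[s, f], f]  # gather row, accumulate
        return out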
@@ -239,11 +218,15 @@ cdef class Parser:
         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
         if depth != 1:
             raise ValueError("Currently parser depth is hard-coded to 1.")
-        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
+                                            cfg.get('maxout_pieces', 2))
         if parser_maxout_pieces != 2:
-            raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
-        token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
+            raise ValueError("Currently parser_maxout_pieces is hard-coded "
+                             "to 2")
+        token_vector_width = util.env_opt('token_vector_width',
+                                          cfg.get('token_vector_width', 128))
+        hidden_width = util.env_opt('hidden_width',
+                                    cfg.get('hidden_width', 200))
         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@@ -365,8 +348,8 @@
                     parse_states = self.parse_batch(subbatch)
                     beams = []
                 else:
-                    beams = self.beam_parse(subbatch,
-                                beam_width=beam_width, beam_density=beam_density)
+                    beams = self.beam_parse(subbatch, beam_width=beam_width,
+                                            beam_density=beam_density)
                     parse_states = []
                     for beam in beams:
                         parse_states.append(<StateClass>beam.at(0))
@@ -386,9 +369,9 @@
         if isinstance(docs, Doc):
             docs = [docs]

-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
-                                                                            0.0)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, 0.0)
         nr_state = len(docs)
         nr_class = self.moves.n_moves
         nr_dim = tokvecs.shape[1]
@@ -402,7 +385,8 @@

         feat_weights = state2vec.get_feat_weights()
         cdef int i
-        cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
+        cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
+            vec2scores._layers[-1].W.T)
         cdef np.ndarray hidden_bias = vec2scores._layers[-1].b

         hW = <float*>hidden_weights.data
@@ -462,9 +446,9 @@
         cdef Doc doc
         cdef int nr_class = self.moves.n_moves
         cdef StateClass stcls, output
-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
-                                                                            0.0)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, 0.0)
         beams = []
         cdef int offset = 0
         cdef int j = 0
@@ -519,9 +503,7 @@
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
-
-        cuda_stream = get_cuda_stream()
-
+        cuda_stream = util.get_cuda_stream()
         states, golds, max_steps = self._init_gold_batch(docs, golds)
         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
                                                                             drop)
@@ -536,7 +518,6 @@
         n_steps = 0
         while todo:
             states, golds = zip(*todo)
-
             token_ids = self.get_token_ids(states)
             vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
             if drop != 0:
@@ -558,8 +539,8 @@
             and not isinstance(token_ids, state2vec.ops.xp.ndarray):
                 # Move token_ids and d_vector to GPU, asynchronously
                 backprops.append((
-                    get_async(cuda_stream, token_ids),
-                    get_async(cuda_stream, d_vector),
+                    util.get_async(cuda_stream, token_ids),
+                    util.get_async(cuda_stream, d_vector),
                     bp_vector
                 ))
             else:
@@ -592,15 +573,13 @@
         states = self.moves.init_batch(docs)
         for gold in golds:
             self.moves.preprocess_gold(gold)
-
-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
-
-        states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
-                                        states, golds,
-                                        state2vec, vec2scores,
-                                        width, density, self.cfg.get('hist_size', 0),
-                                        drop=drop, losses=losses)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, drop)
+        states_d_scores, backprops = _beam_utils.update_beam(
+            self.moves, self.nr_feature, 500, states, golds, state2vec,
+            vec2scores, width, density, self.cfg.get('hist_size', 0),
+            drop=drop, losses=losses)
         backprop_lower = []
         cdef float batch_size = len(docs)
         for i, d_scores in enumerate(states_d_scores):
@@ -612,13 +591,14 @@
             if isinstance(self.model[0].ops, CupyOps) \
             and not isinstance(ids, state2vec.ops.xp.ndarray):
                 backprop_lower.append((
-                    get_async(cuda_stream, ids),
-                    get_async(cuda_stream, d_vector),
+                    util.get_async(cuda_stream, ids),
+                    util.get_async(cuda_stream, d_vector),
                     bp_vectors))
             else:
                 backprop_lower.append((ids, d_vector, bp_vectors))
         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
-        self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
+        self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
+                           cuda_stream)

     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
@@ -768,7 +748,8 @@
     def begin_training(self, gold_tuples, pipeline=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
-        gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
+        gold_tuples = nonproj.preprocess_training_data(gold_tuples,
+                                                       label_freq_cutoff=100)
         actions = self.moves.get_actions(gold_parses=gold_tuples)
         for action, labels in actions.items():
             for label in labels:

@@ -1,39 +1,37 @@
 # coding: utf-8
-"""
-Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
+"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
 for doing pseudo-projective parsing implementation uses the HEAD decoration
 scheme.
 """
 from __future__ import unicode_literals
+
 from copy import copy

-from ..tokens.doc cimport Doc
-from ..attrs import DEP, HEAD

 DELIMITER = '||'


 def ancestors(tokenid, heads):
-    # returns all words going from the word up the path to the root
-    # the path to root cannot be longer than the number of words in the sentence
-    # this function ends after at most len(heads) steps
-    # because it would otherwise loop indefinitely on cycles
+    # Returns all words going from the word up the path to the root. The path
+    # to root cannot be longer than the number of words in the sentence. This
+    # function ends after at most len(heads) steps, because it would otherwise
+    # loop indefinitely on cycles.
     head = tokenid
     cnt = 0
     while heads[head] != head and cnt < len(heads):
         head = heads[head]
         cnt += 1
         yield head
-        if head == None:
+        if head is None:
             break


 def contains_cycle(heads):
-    # in an acyclic tree, the path from each word following
-    # the head relation upwards always ends at the root node
+    # in an acyclic tree, the path from each word following the head relation
+    # upwards always ends at the root node
     for tokenid in range(len(heads)):
         seen = set([tokenid])
-        for ancestor in ancestors(tokenid,heads):
+        for ancestor in ancestors(tokenid, heads):
             if ancestor in seen:
                 return seen
             seen.add(ancestor)
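A quick check of ancestors() on a four-token tree where heads[i] == i marks the root:

    heads = [0, 0, 0, 2]            # tokens 1 and 2 attach to 0; 3 to 2
    list(ancestors(3, heads))       # -> [2, 0]
    list(ancestors(0, heads))       # -> [] (the root has no ancestors)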
@@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
     # if there is a token k, h < k < d such that h is not
     # an ancestor of k. Same for h -> d, h > d
     head = heads[tokenid]
-    if head == tokenid: # root arcs cannot be non-projective
+    if head == tokenid:  # root arcs cannot be non-projective
         return False
-    elif head == None: # unattached tokens cannot be non-projective
+    elif head is None:  # unattached tokens cannot be non-projective
         return False

     start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
-    for k in range(start,end):
-        for ancestor in ancestors(k,heads):
-            if ancestor == None: # for unattached tokens/subtrees
+    for k in range(start, end):
+        for ancestor in ancestors(k, heads):
+            if ancestor is None:  # for unattached tokens/subtrees
                 break
-            elif ancestor == head: # normal case: k dominated by h
+            elif ancestor == head:  # normal case: k dominated by h
                 break
-        else: # head not in ancestors: d -> h is non-projective
+        else:  # head not in ancestors: d -> h is non-projective
             return True
     return False


 def is_nonproj_tree(heads):
     # a tree is non-projective if at least one arc is non-projective
-    return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
+    return any(is_nonproj_arc(word, heads) for word in range(len(heads)))


 def decompose(label):
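A worked example of the definition above: in this four-token tree the arc 1 -> 3 crosses the arc 0 -> 2, so it is non-projective:

    heads = [0, 2, 0, 1]            # heads[i] == i marks the root
    is_nonproj_arc(3, heads)        # True: token 2 lies between 1 and 3,
                                    # but its head chain (2 -> 0) skips 1
    is_nonproj_tree(heads)          # True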
@@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
     for raw_text, sents in gold_tuples:
         prepro_sents = []
         for (ids, words, tags, heads, labels, iob), ctnts in sents:
-            proj_heads,deco_labels = projectivize(heads,labels)
+            proj_heads, deco_labels = projectivize(heads, labels)
             # set the label to ROOT for each root dependent
-            deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
+            deco_labels = ['ROOT' if head == i else deco_labels[i]
+                           for i, head in enumerate(proj_heads)]
             # count label frequencies
             if label_freq_cutoff > 0:
                 for label in deco_labels:
                     if is_decorated(label):
-                        freqs[label] = freqs.get(label,0) + 1
-            prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
+                        freqs[label] = freqs.get(label, 0) + 1
+            prepro_sents.append(
+                ((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
         preprocessed.append((raw_text, prepro_sents))
-
     if label_freq_cutoff > 0:
-        return _filter_labels(preprocessed,label_freq_cutoff,freqs)
+        return _filter_labels(preprocessed, label_freq_cutoff, freqs)
     return preprocessed


 def projectivize(heads, labels):
-    # use the algorithm by Nivre & Nilsson 2005
-    # assumes heads to be a proper tree, i.e. connected and cycle-free
-    # returns a new pair (heads,labels) which encode
-    # a projective and decorated tree
+    # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
+    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
+    # which encode a projective and decorated tree.
     proj_heads = copy(heads)
     smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
-    if smallest_np_arc == None: # this sentence is already projective
+    if smallest_np_arc is None:  # this sentence is already projective
         return proj_heads, copy(labels)
-    while smallest_np_arc != None:
+    while smallest_np_arc is not None:
         _lift(smallest_np_arc, proj_heads)
         smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
     deco_labels = _decorate(heads, proj_heads, labels)
@@ -114,24 +112,26 @@


 def deprojectivize(tokens):
-    # reattach arcs with decorated labels (following HEAD scheme)
-    # for each decorated arc X||Y, search top-down, left-to-right,
-    # breadth-first until hitting a Y then make this the new head
+    # Reattach arcs with decorated labels (following HEAD scheme). For each
+    # decorated arc X||Y, search top-down, left-to-right, breadth-first until
+    # hitting a Y then make this the new head.
     for token in tokens:
         if is_decorated(token.dep_):
-            newlabel,headlabel = decompose(token.dep_)
-            newhead = _find_new_head(token,headlabel)
+            newlabel, headlabel = decompose(token.dep_)
+            newhead = _find_new_head(token, headlabel)
             token.head = newhead
             token.dep_ = newlabel
     return tokens

+
 def _decorate(heads, proj_heads, labels):
     # uses decoration scheme HEAD from Nivre & Nilsson 2005
     assert(len(heads) == len(proj_heads) == len(labels))
     deco_labels = []
-    for tokenid,head in enumerate(heads):
+    for tokenid, head in enumerate(heads):
         if head != proj_heads[tokenid]:
-            deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
+            deco_labels.append(
+                '%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
         else:
             deco_labels.append(labels[tokenid])
     return deco_labels
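Running projectivize() on the crossing tree from the earlier note shows the lift-and-decorate cycle end to end (labels are illustrative):

    heads = [0, 2, 0, 1]
    labels = ['root', 'obj', 'subj', 'mod']
    proj_heads, deco_labels = projectivize(heads, labels)
    # proj_heads  -> [0, 2, 0, 2]   token 3 is lifted from 1 to 1's head, 2
    # deco_labels -> ['root', 'obj', 'subj', 'mod||obj']
    # deprojectivize() reverses this: decompose('mod||obj') yields the new
    # label 'mod' and the target label 'obj', and the breadth-first search
    # reattaches token 3 to the nearby token whose label is 'obj' (token 1).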
|  | @ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads): | ||||||
|     # and ties are broken left to right |     # and ties are broken left to right | ||||||
|     smallest_size = float('inf') |     smallest_size = float('inf') | ||||||
|     smallest_np_arc = None |     smallest_np_arc = None | ||||||
|     for tokenid,head in enumerate(heads): |     for tokenid, head in enumerate(heads): | ||||||
|         size = abs(tokenid-head) |         size = abs(tokenid-head) | ||||||
|         if size < smallest_size and is_nonproj_arc(tokenid,heads): |         if size < smallest_size and is_nonproj_arc(tokenid, heads): | ||||||
|             smallest_size = size |             smallest_size = size | ||||||
|             smallest_np_arc = tokenid |             smallest_np_arc = tokenid | ||||||
|     return smallest_np_arc |     return smallest_np_arc | ||||||
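`is_nonproj_arc` is defined elsewhere in this module. The test it performs: an arc is nonprojective when some token strictly between the child and its head is not dominated by that head. A rough sketch of that check, with an illustrative `_ancestors` helper (names and details are ours, not necessarily the module's):

    def is_nonproj_arc(tokenid, heads):
        head = heads[tokenid]
        if head == tokenid:   # root arcs are trivially projective
            return False
        start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
        return any(head not in set(_ancestors(k, heads))
                   for k in range(start, end))

    def _ancestors(tokenid, heads):
        # walk the head chain up to the root; bound the walk against cycles
        head, steps = heads[tokenid], 0
        while steps < len(heads):
            yield head
            if heads[head] == head:
                break
            head, steps = heads[head], steps + 1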
|  | @ -168,8 +168,10 @@ def _find_new_head(token, headlabel): | ||||||
|         next_queue = [] |         next_queue = [] | ||||||
|         for qtoken in queue: |         for qtoken in queue: | ||||||
|             for child in qtoken.children: |             for child in qtoken.children: | ||||||
|                 if child.is_space: continue |                 if child.is_space: | ||||||
|                 if child == token: continue |                     continue | ||||||
|  |                 if child == token: | ||||||
|  |                     continue | ||||||
|                 if child.dep_ == headlabel: |                 if child.dep_ == headlabel: | ||||||
|                     return child |                     return child | ||||||
|                 next_queue.append(child) |                 next_queue.append(child) | ||||||
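Pieced together with the lines this hunk does not show, the whole search has roughly this shape; the queue initialization and the fallback return are inferred from context:

    def _find_new_head(token, headlabel):
        # breadth-first, left-to-right search below the current head for
        # the first descendant carrying `headlabel`; keep the current
        # head if nothing matches
        queue = [token.head]
        while queue:
            next_queue = []
            for qtoken in queue:
                for child in qtoken.children:
                    if child.is_space:
                        continue
                    if child == token:
                        continue
                    if child.dep_ == headlabel:
                        return child
                    next_queue.append(child)
            queue = next_queue
        return token.head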
|  | @ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs): | ||||||
|     for raw_text, sents in gold_tuples: |     for raw_text, sents in gold_tuples: | ||||||
|         filtered_sents = [] |         filtered_sents = [] | ||||||
|         for (ids, words, tags, heads, labels, iob), ctnts in sents: |         for (ids, words, tags, heads, labels, iob), ctnts in sents: | ||||||
|             filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] |             filtered_labels = [decompose(label)[0] | ||||||
|             filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) |                                if freqs.get(label, cutoff) < cutoff | ||||||
|  |                                else label for label in labels] | ||||||
|  |             filtered_sents.append( | ||||||
|  |                 ((ids, words, tags, heads, filtered_labels, iob), ctnts)) | ||||||
|         filtered.append((raw_text, filtered_sents)) |         filtered.append((raw_text, filtered_sents)) | ||||||
|     return filtered |     return filtered | ||||||
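The effect on a single label, with a hypothetical frequency table: a decorated label seen fewer than `cutoff` times collapses back to its plain part, so rare lift patterns do not inflate the parser's label set:

    freqs = {'prep||nsubjpass': 2, 'prep': 104}   # invented counts
    cutoff = 10
    label = 'prep||nsubjpass'
    filtered = (decompose(label)[0]
                if freqs.get(label, cutoff) < cutoff else label)
    assert filtered == 'prep'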
|  |  | ||||||
|  | @ -2,17 +2,8 @@ | ||||||
| # cython: infer_types=True | # cython: infer_types=True | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from libc.string cimport memcpy, memset |  | ||||||
| from libc.stdint cimport uint32_t, uint64_t |  | ||||||
| import numpy | import numpy | ||||||
| 
 | 
 | ||||||
| from ..vocab cimport EMPTY_LEXEME |  | ||||||
| from ..structs cimport Entity |  | ||||||
| from ..lexeme cimport Lexeme |  | ||||||
| from ..symbols cimport punct |  | ||||||
| from ..attrs cimport IS_SPACE |  | ||||||
| from ..attrs cimport attr_id_t |  | ||||||
| from ..tokens.token cimport Token |  | ||||||
| from ..tokens.doc cimport Doc | from ..tokens.doc cimport Doc | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -2,17 +2,17 @@ | ||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF | from cpython.ref cimport Py_INCREF | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
| from thinc.typedefs cimport weight_t | from thinc.typedefs cimport weight_t | ||||||
| from collections import defaultdict, OrderedDict | from collections import OrderedDict | ||||||
| import ujson | import ujson | ||||||
| 
 | 
 | ||||||
| from .. import util |  | ||||||
| from ..structs cimport TokenC | from ..structs cimport TokenC | ||||||
| from .stateclass cimport StateClass | from .stateclass cimport StateClass | ||||||
| from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB |  | ||||||
| from ..typedefs cimport attr_t | from ..typedefs cimport attr_t | ||||||
|  | from ..compat import json_dumps | ||||||
|  | from .. import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef weight_t MIN_SCORE = -90000 | cdef weight_t MIN_SCORE = -90000 | ||||||
|  | @ -136,11 +136,12 @@ cdef class TransitionSystem: | ||||||
|             print([gold.c.ner[i].clas for i in range(gold.length)]) |             print([gold.c.ner[i].clas for i in range(gold.length)]) | ||||||
|             print([gold.c.ner[i].move for i in range(gold.length)]) |             print([gold.c.ner[i].move for i in range(gold.length)]) | ||||||
|             print([gold.c.ner[i].label for i in range(gold.length)]) |             print([gold.c.ner[i].label for i in range(gold.length)]) | ||||||
|             print("Self labels", [self.c[i].label for i in range(self.n_moves)]) |             print("Self labels", | ||||||
|  |                   [self.c[i].label for i in range(self.n_moves)]) | ||||||
|             raise ValueError( |             raise ValueError( | ||||||
|                 "Could not find a gold-standard action to supervise " |                 "Could not find a gold-standard action to supervise " | ||||||
|                 "the entity recognizer\n" |                 "the entity recognizer. The transition system has " | ||||||
|                 "The transition system has %d actions." % (self.n_moves)) |                 "%d actions." % (self.n_moves)) | ||||||
| 
 | 
 | ||||||
|     def get_class_name(self, int clas): |     def get_class_name(self, int clas): | ||||||
|         act = self.c[clas] |         act = self.c[clas] | ||||||
|  | @ -149,7 +150,7 @@ cdef class TransitionSystem: | ||||||
|     def add_action(self, int action, label_name): |     def add_action(self, int action, label_name): | ||||||
|         cdef attr_t label_id |         cdef attr_t label_id | ||||||
|         if not isinstance(label_name, int) and \ |         if not isinstance(label_name, int) and \ | ||||||
|         not isinstance(label_name, long): |            not isinstance(label_name, long): | ||||||
|             label_id = self.strings.add(label_name) |             label_id = self.strings.add(label_name) | ||||||
|         else: |         else: | ||||||
|             label_id = label_name |             label_id = label_name | ||||||
|  | @ -186,7 +187,7 @@ cdef class TransitionSystem: | ||||||
|                 'name': self.move_name(trans.move, trans.label) |                 'name': self.move_name(trans.move, trans.label) | ||||||
|             }) |             }) | ||||||
|         serializers = { |         serializers = { | ||||||
|             'transitions': lambda: ujson.dumps(transitions), |             'transitions': lambda: json_dumps(transitions), | ||||||
|             'strings': lambda: self.strings.to_bytes() |             'strings': lambda: self.strings.to_bytes() | ||||||
|         } |         } | ||||||
|         return util.to_bytes(serializers, exclude) |         return util.to_bytes(serializers, exclude) | ||||||
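`util.to_bytes` consumes this dict of zero-argument callables: it invokes every getter whose key is not excluded and packs the results into one byte string. A minimal sketch of that contract, assuming a msgpack-style packer (the real helper lives in spacy.util and may differ in detail):

    from collections import OrderedDict
    import msgpack   # assumption: stand-in for whatever packer spacy.util uses

    def to_bytes_sketch(getters, exclude=tuple()):
        serial = OrderedDict((key, getter())
                             for key, getter in getters.items()
                             if key not in exclude)
        return msgpack.dumps(serial)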
|  |  | ||||||