Mirror of https://github.com/explosion/spaCy.git

Remove obsolete parser.pyx

This commit is contained in:
    parent a8abc47811
    commit 33f8c58782
@@ -1,259 +0,0 @@
| from thinc.typedefs cimport atom_t |  | ||||||
| 
 |  | ||||||
| from .stateclass cimport StateClass |  | ||||||
| from ._state cimport StateC |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef int fill_context(atom_t* context, const StateC* state) nogil |  | ||||||
| # Context elements |  | ||||||
| 
 |  | ||||||
| # Each token contributes the same attributes, in this fixed order: w, W, p, |  | ||||||
| # c, c4, c6, L, prefix, suffix, shape, ne_iob, ne_type. The enum below relies |  | ||||||
| # on that order: an attribute is addressed by adding its offset to the slot's |  | ||||||
| # w index, and fill_token() (in the companion .pyx) writes values in the same |  | ||||||
| # order. |  | ||||||
| 
 |  | ||||||
| # Tokens are listed in left-to-right order. |  | ||||||
| #cdef size_t* SLOTS = [ |  | ||||||
| #    S2w, S1w, |  | ||||||
| #    S0l0w, S0l2w, S0lw, |  | ||||||
| #    S0w, |  | ||||||
| #    S0r0w, S0r2w, S0rw, |  | ||||||
| #    N0l0w, N0l2w, N0lw, |  | ||||||
| #    P2w, P1w, |  | ||||||
| #    N0w, N1w, N2w, N3w, 0 |  | ||||||
| #] |  | ||||||
| 
 |  | ||||||
| # NB: The order of the enum is _NOT_ arbitrary!! |  | ||||||
| cpdef enum: |  | ||||||
|     S2w |  | ||||||
|     S2W |  | ||||||
|     S2p |  | ||||||
|     S2c |  | ||||||
|     S2c4 |  | ||||||
|     S2c6 |  | ||||||
|     S2L |  | ||||||
|     S2_prefix |  | ||||||
|     S2_suffix |  | ||||||
|     S2_shape |  | ||||||
|     S2_ne_iob |  | ||||||
|     S2_ne_type |  | ||||||
| 
 |  | ||||||
|     S1w |  | ||||||
|     S1W |  | ||||||
|     S1p |  | ||||||
|     S1c |  | ||||||
|     S1c4 |  | ||||||
|     S1c6 |  | ||||||
|     S1L |  | ||||||
|     S1_prefix |  | ||||||
|     S1_suffix |  | ||||||
|     S1_shape |  | ||||||
|     S1_ne_iob |  | ||||||
|     S1_ne_type |  | ||||||
| 
 |  | ||||||
|     S1rw |  | ||||||
|     S1rW |  | ||||||
|     S1rp |  | ||||||
|     S1rc |  | ||||||
|     S1rc4 |  | ||||||
|     S1rc6 |  | ||||||
|     S1rL |  | ||||||
|     S1r_prefix |  | ||||||
|     S1r_suffix |  | ||||||
|     S1r_shape |  | ||||||
|     S1r_ne_iob |  | ||||||
|     S1r_ne_type |  | ||||||
| 
 |  | ||||||
|     S0lw |  | ||||||
|     S0lW |  | ||||||
|     S0lp |  | ||||||
|     S0lc |  | ||||||
|     S0lc4 |  | ||||||
|     S0lc6 |  | ||||||
|     S0lL |  | ||||||
|     S0l_prefix |  | ||||||
|     S0l_suffix |  | ||||||
|     S0l_shape |  | ||||||
|     S0l_ne_iob |  | ||||||
|     S0l_ne_type |  | ||||||
| 
 |  | ||||||
|     S0l2w |  | ||||||
|     S0l2W |  | ||||||
|     S0l2p |  | ||||||
|     S0l2c |  | ||||||
|     S0l2c4 |  | ||||||
|     S0l2c6 |  | ||||||
|     S0l2L |  | ||||||
|     S0l2_prefix |  | ||||||
|     S0l2_suffix |  | ||||||
|     S0l2_shape |  | ||||||
|     S0l2_ne_iob |  | ||||||
|     S0l2_ne_type |  | ||||||
| 
 |  | ||||||
|     S0w |  | ||||||
|     S0W |  | ||||||
|     S0p |  | ||||||
|     S0c |  | ||||||
|     S0c4 |  | ||||||
|     S0c6 |  | ||||||
|     S0L |  | ||||||
|     S0_prefix |  | ||||||
|     S0_suffix |  | ||||||
|     S0_shape |  | ||||||
|     S0_ne_iob |  | ||||||
|     S0_ne_type |  | ||||||
| 
 |  | ||||||
|     S0r2w |  | ||||||
|     S0r2W |  | ||||||
|     S0r2p |  | ||||||
|     S0r2c |  | ||||||
|     S0r2c4 |  | ||||||
|     S0r2c6 |  | ||||||
|     S0r2L |  | ||||||
|     S0r2_prefix |  | ||||||
|     S0r2_suffix |  | ||||||
|     S0r2_shape |  | ||||||
|     S0r2_ne_iob |  | ||||||
|     S0r2_ne_type |  | ||||||
| 
 |  | ||||||
|     S0rw |  | ||||||
|     S0rW |  | ||||||
|     S0rp |  | ||||||
|     S0rc |  | ||||||
|     S0rc4 |  | ||||||
|     S0rc6 |  | ||||||
|     S0rL |  | ||||||
|     S0r_prefix |  | ||||||
|     S0r_suffix |  | ||||||
|     S0r_shape |  | ||||||
|     S0r_ne_iob |  | ||||||
|     S0r_ne_type |  | ||||||
| 
 |  | ||||||
|     N0l2w |  | ||||||
|     N0l2W |  | ||||||
|     N0l2p |  | ||||||
|     N0l2c |  | ||||||
|     N0l2c4 |  | ||||||
|     N0l2c6 |  | ||||||
|     N0l2L |  | ||||||
|     N0l2_prefix |  | ||||||
|     N0l2_suffix |  | ||||||
|     N0l2_shape |  | ||||||
|     N0l2_ne_iob |  | ||||||
|     N0l2_ne_type |  | ||||||
| 
 |  | ||||||
|     N0lw |  | ||||||
|     N0lW |  | ||||||
|     N0lp |  | ||||||
|     N0lc |  | ||||||
|     N0lc4 |  | ||||||
|     N0lc6 |  | ||||||
|     N0lL |  | ||||||
|     N0l_prefix |  | ||||||
|     N0l_suffix |  | ||||||
|     N0l_shape |  | ||||||
|     N0l_ne_iob |  | ||||||
|     N0l_ne_type |  | ||||||
| 
 |  | ||||||
|     N0w |  | ||||||
|     N0W |  | ||||||
|     N0p |  | ||||||
|     N0c |  | ||||||
|     N0c4 |  | ||||||
|     N0c6 |  | ||||||
|     N0L |  | ||||||
|     N0_prefix |  | ||||||
|     N0_suffix |  | ||||||
|     N0_shape |  | ||||||
|     N0_ne_iob |  | ||||||
|     N0_ne_type |  | ||||||
| 
 |  | ||||||
|     N1w |  | ||||||
|     N1W |  | ||||||
|     N1p |  | ||||||
|     N1c |  | ||||||
|     N1c4 |  | ||||||
|     N1c6 |  | ||||||
|     N1L |  | ||||||
|     N1_prefix |  | ||||||
|     N1_suffix |  | ||||||
|     N1_shape |  | ||||||
|     N1_ne_iob |  | ||||||
|     N1_ne_type |  | ||||||
| 
 |  | ||||||
|     N2w |  | ||||||
|     N2W |  | ||||||
|     N2p |  | ||||||
|     N2c |  | ||||||
|     N2c4 |  | ||||||
|     N2c6 |  | ||||||
|     N2L |  | ||||||
|     N2_prefix |  | ||||||
|     N2_suffix |  | ||||||
|     N2_shape |  | ||||||
|     N2_ne_iob |  | ||||||
|     N2_ne_type |  | ||||||
| 
 |  | ||||||
|     P1w |  | ||||||
|     P1W |  | ||||||
|     P1p |  | ||||||
|     P1c |  | ||||||
|     P1c4 |  | ||||||
|     P1c6 |  | ||||||
|     P1L |  | ||||||
|     P1_prefix |  | ||||||
|     P1_suffix |  | ||||||
|     P1_shape |  | ||||||
|     P1_ne_iob |  | ||||||
|     P1_ne_type |  | ||||||
| 
 |  | ||||||
|     P2w |  | ||||||
|     P2W |  | ||||||
|     P2p |  | ||||||
|     P2c |  | ||||||
|     P2c4 |  | ||||||
|     P2c6 |  | ||||||
|     P2L |  | ||||||
|     P2_prefix |  | ||||||
|     P2_suffix |  | ||||||
|     P2_shape |  | ||||||
|     P2_ne_iob |  | ||||||
|     P2_ne_type |  | ||||||
| 
 |  | ||||||
|     E0w |  | ||||||
|     E0W |  | ||||||
|     E0p |  | ||||||
|     E0c |  | ||||||
|     E0c4 |  | ||||||
|     E0c6 |  | ||||||
|     E0L |  | ||||||
|     E0_prefix |  | ||||||
|     E0_suffix |  | ||||||
|     E0_shape |  | ||||||
|     E0_ne_iob |  | ||||||
|     E0_ne_type |  | ||||||
| 
 |  | ||||||
|     E1w |  | ||||||
|     E1W |  | ||||||
|     E1p |  | ||||||
|     E1c |  | ||||||
|     E1c4 |  | ||||||
|     E1c6 |  | ||||||
|     E1L |  | ||||||
|     E1_prefix |  | ||||||
|     E1_suffix |  | ||||||
|     E1_shape |  | ||||||
|     E1_ne_iob |  | ||||||
|     E1_ne_type |  | ||||||
| 
 |  | ||||||
|     # Misc features at the end |  | ||||||
|     dist |  | ||||||
|     N0lv |  | ||||||
|     S0lv |  | ||||||
|     S0rv |  | ||||||
|     S1lv |  | ||||||
|     S1rv |  | ||||||
| 
 |  | ||||||
|     S0_has_head |  | ||||||
|     S1_has_head |  | ||||||
|     S2_has_head |  | ||||||
| 
 |  | ||||||
|     CONTEXT_SIZE |  | ||||||
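# Editor's note: a small, self-contained sketch (not part of the original file)
# of the offset arithmetic the enum above encodes. Every slot (S2, S1, S1r, ...,
# E1) lists the same 12 attributes in the same order, so attribute j of a slot
# sits at <slot>w + j in the context array, and fill_token() in the companion
# implementation fills exactly those 12 consecutive cells.
ATTRS = ['w', 'W', 'p', 'c', 'c4', 'c6', 'L',
         '_prefix', '_suffix', '_shape', '_ne_iob', '_ne_type']
SLOTS = ['S2', 'S1', 'S1r', 'S0l', 'S0l2', 'S0', 'S0r2', 'S0r',
         'N0l2', 'N0l', 'N0', 'N1', 'N2', 'P1', 'P2', 'E0', 'E1']
offsets = {slot + attr: i * len(ATTRS) + j
           for i, slot in enumerate(SLOTS)
           for j, attr in enumerate(ATTRS)}
assert offsets['S0p'] == offsets['S0w'] + 2    # p is the third attribute of a slot
assert offsets['S1w'] == offsets['S2w'] + 12   # each slot occupies 12 cells
# The misc features (dist, N0lv, ..., S2_has_head) follow at 17 * 12 == 204.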
@@ -1,419 +0,0 @@
| """ |  | ||||||
| Fill an array, context, with every _atomic_ value our features reference. |  | ||||||
| We then write the _actual features_ as tuples of the atoms. The machinery |  | ||||||
| that translates from the tuples to feature-extractors (which pick the values |  | ||||||
| out of "context") is in features/extractor.pyx |  | ||||||
| 
 |  | ||||||
| The atomic feature names are listed in a big enum, so that the feature tuples |  | ||||||
| can refer to them. |  | ||||||
| """ |  | ||||||
| # coding: utf-8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| from libc.string cimport memset |  | ||||||
| from itertools import combinations |  | ||||||
| from cymem.cymem cimport Pool |  | ||||||
| 
 |  | ||||||
| from ..structs cimport TokenC |  | ||||||
| from .stateclass cimport StateClass |  | ||||||
| from ._state cimport StateC |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: |  | ||||||
|     if token is NULL: |  | ||||||
|         context[0] = 0 |  | ||||||
|         context[1] = 0 |  | ||||||
|         context[2] = 0 |  | ||||||
|         context[3] = 0 |  | ||||||
|         context[4] = 0 |  | ||||||
|         context[5] = 0 |  | ||||||
|         context[6] = 0 |  | ||||||
|         context[7] = 0 |  | ||||||
|         context[8] = 0 |  | ||||||
|         context[9] = 0 |  | ||||||
|         context[10] = 0 |  | ||||||
|         context[11] = 0 |  | ||||||
|     else: |  | ||||||
|         context[0] = token.lex.orth |  | ||||||
|         context[1] = token.lemma |  | ||||||
|         context[2] = token.tag |  | ||||||
|         context[3] = token.lex.cluster |  | ||||||
|         # We've read the cluster string in little-endian (first character in |  | ||||||
|         # the lowest bit), so taking & (2**n)-1 keeps the first n bits, i.e. |  | ||||||
|         # the first n characters of the Brown cluster path. |  | ||||||
|         # e.g. s = "1110010101" |  | ||||||
|         # cluster = int(''.join(reversed(s)), 2) |  | ||||||
|         # first_4_bits = cluster & 15 |  | ||||||
|         # print first_4_bits |  | ||||||
|         # 7 |  | ||||||
|         # print "{0:b}".format(first_4_bits).zfill(4)[::-1] |  | ||||||
|         # 1110 |  | ||||||
|         # What we're doing here is picking a number where all bits are 1, e.g. |  | ||||||
|         # 15 is 1111 and 63 is 111111, and doing bitwise AND, so keeping only |  | ||||||
|         # the low bits of the cluster. |  | ||||||
|         context[4] = token.lex.cluster & 15 |  | ||||||
|         context[5] = token.lex.cluster & 63 |  | ||||||
|         context[6] = token.dep if token.head != 0 else 0 |  | ||||||
|         context[7] = token.lex.prefix |  | ||||||
|         context[8] = token.lex.suffix |  | ||||||
|         context[9] = token.lex.shape |  | ||||||
|         context[10] = token.ent_iob |  | ||||||
|         context[11] = token.ent_type |  | ||||||
| 
 |  | ||||||
| cdef int fill_context(atom_t* ctxt, const StateC* st) nogil: |  | ||||||
|     # Take care to fill every element of context! |  | ||||||
|     # We could memset, but this makes it very easy to have broken features that |  | ||||||
|     # make almost no impact on accuracy. If instead they're unset, the impact |  | ||||||
|     # tends to be dramatic, so we get an obvious regression to fix... |  | ||||||
|     fill_token(&ctxt[S2w], st.S_(2)) |  | ||||||
|     fill_token(&ctxt[S1w], st.S_(1)) |  | ||||||
|     fill_token(&ctxt[S1rw], st.R_(st.S(1), 1)) |  | ||||||
|     fill_token(&ctxt[S0lw], st.L_(st.S(0), 1)) |  | ||||||
|     fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2)) |  | ||||||
|     fill_token(&ctxt[S0w], st.S_(0)) |  | ||||||
|     fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2)) |  | ||||||
|     fill_token(&ctxt[S0rw], st.R_(st.S(0), 1)) |  | ||||||
|     fill_token(&ctxt[N0lw], st.L_(st.B(0), 1)) |  | ||||||
|     fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2)) |  | ||||||
|     fill_token(&ctxt[N0w], st.B_(0)) |  | ||||||
|     fill_token(&ctxt[N1w], st.B_(1)) |  | ||||||
|     fill_token(&ctxt[N2w], st.B_(2)) |  | ||||||
|     fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1)) |  | ||||||
|     fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2)) |  | ||||||
| 
 |  | ||||||
|     fill_token(&ctxt[E0w], st.E_(0)) |  | ||||||
|     fill_token(&ctxt[E1w], st.E_(1)) |  | ||||||
| 
 |  | ||||||
|     if st.stack_depth() >= 1 and not st.eol(): |  | ||||||
|         ctxt[dist] = min_(st.B(0) - st.E(0), 5) |  | ||||||
|     else: |  | ||||||
|         ctxt[dist] = 0 |  | ||||||
|     ctxt[N0lv] = min_(st.n_L(st.B(0)), 5) |  | ||||||
|     ctxt[S0lv] = min_(st.n_L(st.S(0)), 5) |  | ||||||
|     ctxt[S0rv] = min_(st.n_R(st.S(0)), 5) |  | ||||||
|     ctxt[S1lv] = min_(st.n_L(st.S(1)), 5) |  | ||||||
|     ctxt[S1rv] = min_(st.n_R(st.S(1)), 5) |  | ||||||
| 
 |  | ||||||
|     ctxt[S0_has_head] = 0 |  | ||||||
|     ctxt[S1_has_head] = 0 |  | ||||||
|     ctxt[S2_has_head] = 0 |  | ||||||
|     if st.stack_depth() >= 1: |  | ||||||
|         ctxt[S0_has_head] = st.has_head(st.S(0)) + 1 |  | ||||||
|         if st.stack_depth() >= 2: |  | ||||||
|             ctxt[S1_has_head] = st.has_head(st.S(1)) + 1 |  | ||||||
|             if st.stack_depth() >= 3: |  | ||||||
|                 ctxt[S2_has_head] = st.has_head(st.S(2)) + 1 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef inline int min_(int a, int b) nogil: |  | ||||||
|     return a if a < b else b |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| ner = ( |  | ||||||
|     (N0W,), |  | ||||||
|     (P1W,), |  | ||||||
|     (N1W,), |  | ||||||
|     (P2W,), |  | ||||||
|     (N2W,), |  | ||||||
| 
 |  | ||||||
|     (P1W, N0W,), |  | ||||||
|     (N0W, N1W), |  | ||||||
| 
 |  | ||||||
|     (N0_prefix,), |  | ||||||
|     (N0_suffix,), |  | ||||||
| 
 |  | ||||||
|     (P1_shape,), |  | ||||||
|     (N0_shape,), |  | ||||||
|     (N1_shape,), |  | ||||||
|     (P1_shape, N0_shape,), |  | ||||||
|     (N0_shape, P1_shape,), |  | ||||||
|     (P1_shape, N0_shape, N1_shape), |  | ||||||
|     (N2_shape,), |  | ||||||
|     (P2_shape,), |  | ||||||
| 
 |  | ||||||
|     #(P2_norm, P1_norm, W_norm), |  | ||||||
|     #(P1_norm, W_norm, N1_norm), |  | ||||||
|     #(W_norm, N1_norm, N2_norm) |  | ||||||
| 
 |  | ||||||
|     (P2p,), |  | ||||||
|     (P1p,), |  | ||||||
|     (N0p,), |  | ||||||
|     (N1p,), |  | ||||||
|     (N2p,), |  | ||||||
| 
 |  | ||||||
|     (P1p, N0p), |  | ||||||
|     (N0p, N1p), |  | ||||||
|     (P2p, P1p, N0p), |  | ||||||
|     (P1p, N0p, N1p), |  | ||||||
|     (N0p, N1p, N2p), |  | ||||||
| 
 |  | ||||||
|     (P2c,), |  | ||||||
|     (P1c,), |  | ||||||
|     (N0c,), |  | ||||||
|     (N1c,), |  | ||||||
|     (N2c,), |  | ||||||
| 
 |  | ||||||
|     (P1c, N0c), |  | ||||||
|     (N0c, N1c), |  | ||||||
| 
 |  | ||||||
|     (E0W,), |  | ||||||
|     (E0c,), |  | ||||||
|     (E0p,), |  | ||||||
| 
 |  | ||||||
|     (E0W, N0W), |  | ||||||
|     (E0c, N0W), |  | ||||||
|     (E0p, N0W), |  | ||||||
| 
 |  | ||||||
|     (E0p, P1p, N0p), |  | ||||||
|     (E0c, P1c, N0c), |  | ||||||
| 
 |  | ||||||
|     (E0w, P1c), |  | ||||||
|     (E0p, P1p), |  | ||||||
|     (E0c, P1c), |  | ||||||
|     (E0p, E1p), |  | ||||||
|     (E0c, P1p), |  | ||||||
| 
 |  | ||||||
|     (E1W,), |  | ||||||
|     (E1c,), |  | ||||||
|     (E1p,), |  | ||||||
| 
 |  | ||||||
|     (E0W, E1W), |  | ||||||
|     (E0W, E1p,), |  | ||||||
|     (E0p, E1W,), |  | ||||||
|     (E0p, E1W), |  | ||||||
| 
 |  | ||||||
|     (P1_ne_iob,), |  | ||||||
|     (P1_ne_iob, P1_ne_type), |  | ||||||
|     (N0w, P1_ne_iob, P1_ne_type), |  | ||||||
| 
 |  | ||||||
|     (N0_shape,), |  | ||||||
|     (N1_shape,), |  | ||||||
|     (N2_shape,), |  | ||||||
|     (P1_shape,), |  | ||||||
|     (P2_shape,), |  | ||||||
| 
 |  | ||||||
|     (N0_prefix,), |  | ||||||
|     (N0_suffix,), |  | ||||||
| 
 |  | ||||||
|     (P1_ne_iob,), |  | ||||||
|     (P2_ne_iob,), |  | ||||||
|     (P1_ne_iob, P2_ne_iob), |  | ||||||
|     (P1_ne_iob, P1_ne_type), |  | ||||||
|     (P2_ne_iob, P2_ne_type), |  | ||||||
|     (N0w, P1_ne_iob, P1_ne_type), |  | ||||||
| 
 |  | ||||||
|     (N0w, N1w), |  | ||||||
| ) |  | ||||||
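# Editor's note: each tuple above is one NER feature template, read as a
# conjunction of the atoms named in _parse_features.pxd -- e.g.
# (P1_shape, N0_shape) conjoins the word shape of the previous token with that
# of the current one, and (N0w, P1_ne_iob, P1_ne_type) conjoins the current
# word with the IOB tag and entity type assigned to the previous token.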
| 
 |  | ||||||
| 
 |  | ||||||
| unigrams = ( |  | ||||||
|     (S2W, S2p), |  | ||||||
|     (S2c6, S2p), |  | ||||||
| 
 |  | ||||||
|     (S1W, S1p), |  | ||||||
|     (S1c6, S1p), |  | ||||||
| 
 |  | ||||||
|     (S0W, S0p), |  | ||||||
|     (S0c6, S0p), |  | ||||||
| 
 |  | ||||||
|     (N0W, N0p), |  | ||||||
|     (N0p,), |  | ||||||
|     (N0c,), |  | ||||||
|     (N0c6, N0p), |  | ||||||
|     (N0L,), |  | ||||||
| 
 |  | ||||||
|     (N1W, N1p), |  | ||||||
|     (N1c6, N1p), |  | ||||||
| 
 |  | ||||||
|     (N2W, N2p), |  | ||||||
|     (N2c6, N2p), |  | ||||||
| 
 |  | ||||||
|     (S0r2W, S0r2p), |  | ||||||
|     (S0r2c6, S0r2p), |  | ||||||
|     (S0r2L,), |  | ||||||
| 
 |  | ||||||
|     (S0rW, S0rp), |  | ||||||
|     (S0rc6, S0rp), |  | ||||||
|     (S0rL,), |  | ||||||
| 
 |  | ||||||
|     (S0l2W, S0l2p), |  | ||||||
|     (S0l2c6, S0l2p), |  | ||||||
|     (S0l2L,), |  | ||||||
| 
 |  | ||||||
|     (S0lW, S0lp), |  | ||||||
|     (S0lc6, S0lp), |  | ||||||
|     (S0lL,), |  | ||||||
| 
 |  | ||||||
|     (N0l2W, N0l2p), |  | ||||||
|     (N0l2c6, N0l2p), |  | ||||||
|     (N0l2L,), |  | ||||||
| 
 |  | ||||||
|     (N0lW, N0lp), |  | ||||||
|     (N0lc6, N0lp), |  | ||||||
|     (N0lL,), |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| s0_n0 = ( |  | ||||||
|     (S0W, S0p, N0W, N0p), |  | ||||||
|     (S0c, S0p, N0c, N0p), |  | ||||||
|     (S0c6, S0p, N0c6, N0p), |  | ||||||
|     (S0c4, S0p, N0c4, N0p), |  | ||||||
|     (S0p, N0p), |  | ||||||
|     (S0W, N0p), |  | ||||||
|     (S0p, N0W), |  | ||||||
|     (S0W, N0c), |  | ||||||
|     (S0c, N0W), |  | ||||||
|     (S0p, N0c), |  | ||||||
|     (S0c, N0p), |  | ||||||
|     (S0W, S0rp, N0p), |  | ||||||
|     (S0p, S0rp, N0p), |  | ||||||
|     (S0p, N0lp, N0W), |  | ||||||
|     (S0p, N0lp, N0p), |  | ||||||
|     (S0L, N0p), |  | ||||||
|     (S0p, S0rL, N0p), |  | ||||||
|     (S0p, N0lL, N0p), |  | ||||||
|     (S0p, S0rv, N0p), |  | ||||||
|     (S0p, N0lv, N0p), |  | ||||||
|     (S0c6, S0rL, S0r2L, N0p), |  | ||||||
|     (S0p, N0lL, N0l2L, N0p), |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| s1_s0 = ( |  | ||||||
|     (S1p, S0p), |  | ||||||
|     (S1p, S0p, S0_has_head), |  | ||||||
|     (S1W, S0p), |  | ||||||
|     (S1W, S0p, S0_has_head), |  | ||||||
|     (S1c, S0p), |  | ||||||
|     (S1c, S0p, S0_has_head), |  | ||||||
|     (S1p, S1rL, S0p), |  | ||||||
|     (S1p, S1rL, S0p, S0_has_head), |  | ||||||
|     (S1p, S0lL, S0p), |  | ||||||
|     (S1p, S0lL, S0p, S0_has_head), |  | ||||||
|     (S1p, S0lL, S0l2L, S0p), |  | ||||||
|     (S1p, S0lL, S0l2L, S0p, S0_has_head), |  | ||||||
|     (S1L, S0L, S0W), |  | ||||||
|     (S1L, S0L, S0p), |  | ||||||
|     (S1p, S1L, S0L, S0p), |  | ||||||
|     (S1p, S0p), |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| s1_n0 = ( |  | ||||||
|     (S1p, N0p), |  | ||||||
|     (S1c, N0c), |  | ||||||
|     (S1c, N0p), |  | ||||||
|     (S1p, N0c), |  | ||||||
|     (S1W, S1p, N0p), |  | ||||||
|     (S1p, N0W, N0p), |  | ||||||
|     (S1c6, S1p, N0c6, N0p), |  | ||||||
|     (S1L, N0p), |  | ||||||
|     (S1p, S1rL, N0p), |  | ||||||
|     (S1p, S1rp, N0p), |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| s0_n1 = ( |  | ||||||
|     (S0p, N1p), |  | ||||||
|     (S0c, N1c), |  | ||||||
|     (S0c, N1p), |  | ||||||
|     (S0p, N1c), |  | ||||||
|     (S0W, S0p, N1p), |  | ||||||
|     (S0p, N1W, N1p), |  | ||||||
|     (S0c6, S0p, N1c6, N1p), |  | ||||||
|     (S0L, N1p), |  | ||||||
|     (S0p, S0rL, N1p), |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| n0_n1 = ( |  | ||||||
|     (N0W, N0p, N1W, N1p), |  | ||||||
|     (N0W, N0p, N1p), |  | ||||||
|     (N0p, N1W, N1p), |  | ||||||
|     (N0c, N0p, N1c, N1p), |  | ||||||
|     (N0c6, N0p, N1c6, N1p), |  | ||||||
|     (N0c, N1c), |  | ||||||
|     (N0p, N1c), |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| tree_shape = ( |  | ||||||
|     (dist,), |  | ||||||
|     (S0p, S0_has_head, S1_has_head, S2_has_head), |  | ||||||
|     (S0p, S0lv, S0rv), |  | ||||||
|     (N0p, N0lv), |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| trigrams = ( |  | ||||||
|     (N0p, N1p, N2p), |  | ||||||
|     (S0p, S0lp, S0l2p), |  | ||||||
|     (S0p, S0rp, S0r2p), |  | ||||||
|     (S0p, S1p, S2p), |  | ||||||
|     (S1p, S0p, N0p), |  | ||||||
|     (S0p, S0lp, N0p), |  | ||||||
|     (S0p, N0p, N0lp), |  | ||||||
|     (N0p, N0lp, N0l2p), |  | ||||||
| 
 |  | ||||||
|     (S0W, S0p, S0rL, S0r2L), |  | ||||||
|     (S0p, S0rL, S0r2L), |  | ||||||
| 
 |  | ||||||
|     (S0W, S0p, S0lL, S0l2L), |  | ||||||
|     (S0p, S0lL, S0l2L), |  | ||||||
| 
 |  | ||||||
|     (N0W, N0p, N0lL, N0l2L), |  | ||||||
|     (N0p, N0lL, N0l2L), |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| words = ( |  | ||||||
|     S2w, |  | ||||||
|     S1w, |  | ||||||
|     S1rw, |  | ||||||
|     S0lw, |  | ||||||
|     S0l2w, |  | ||||||
|     S0w, |  | ||||||
|     S0r2w, |  | ||||||
|     S0rw, |  | ||||||
|     N0lw, |  | ||||||
|     N0l2w, |  | ||||||
|     N0w, |  | ||||||
|     N1w, |  | ||||||
|     N2w, |  | ||||||
|     P1w, |  | ||||||
|     P2w |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| tags = ( |  | ||||||
|     S2p, |  | ||||||
|     S1p, |  | ||||||
|     S1rp, |  | ||||||
|     S0lp, |  | ||||||
|     S0l2p, |  | ||||||
|     S0p, |  | ||||||
|     S0r2p, |  | ||||||
|     S0rp, |  | ||||||
|     N0lp, |  | ||||||
|     N0l2p, |  | ||||||
|     N0p, |  | ||||||
|     N1p, |  | ||||||
|     N2p, |  | ||||||
|     P1p, |  | ||||||
|     P2p |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| labels = ( |  | ||||||
|     S2L, |  | ||||||
|     S1L, |  | ||||||
|     S1rL, |  | ||||||
|     S0lL, |  | ||||||
|     S0l2L, |  | ||||||
|     S0L, |  | ||||||
|     S0r2L, |  | ||||||
|     S0rL, |  | ||||||
|     N0lL, |  | ||||||
|     N0l2L, |  | ||||||
|     N0L, |  | ||||||
|     N1L, |  | ||||||
|     N2L, |  | ||||||
|     P1L, |  | ||||||
|     P2L |  | ||||||
| ) |  | ||||||
@@ -1,10 +0,0 @@
| from .parser cimport Parser |  | ||||||
| from ..structs cimport TokenC |  | ||||||
| from thinc.typedefs cimport weight_t |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class BeamParser(Parser): |  | ||||||
|     cdef public int beam_width |  | ||||||
|     cdef public weight_t beam_density |  | ||||||
| 
 |  | ||||||
|     cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1 |  | ||||||
@@ -1,239 +0,0 @@
| """ |  | ||||||
| MALT-style dependency parser |  | ||||||
| """ |  | ||||||
| # cython: profile=True |  | ||||||
| # cython: experimental_cpp_class_def=True |  | ||||||
| # cython: cdivision=True |  | ||||||
| # cython: infer_types=True |  | ||||||
| # coding: utf-8 |  | ||||||
| 
 |  | ||||||
| from __future__ import unicode_literals, print_function |  | ||||||
| cimport cython |  | ||||||
| 
 |  | ||||||
| from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF |  | ||||||
| from libc.stdint cimport uint32_t, uint64_t |  | ||||||
| from libc.string cimport memset, memcpy |  | ||||||
| from libc.stdlib cimport rand |  | ||||||
| from libc.math cimport log, exp, isnan, isinf |  | ||||||
| from cymem.cymem cimport Pool, Address |  | ||||||
| from murmurhash.mrmr cimport real_hash64 as hash64 |  | ||||||
| from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t |  | ||||||
| from thinc.linear.features cimport ConjunctionExtracter |  | ||||||
| from thinc.structs cimport FeatureC, ExampleC |  | ||||||
| from thinc.extra.search cimport Beam, MaxViolation |  | ||||||
| from thinc.extra.eg cimport Example |  | ||||||
| from thinc.extra.mb cimport Minibatch |  | ||||||
| 
 |  | ||||||
| from ..structs cimport TokenC |  | ||||||
| from ..tokens.doc cimport Doc |  | ||||||
| from ..strings cimport StringStore |  | ||||||
| from .transition_system cimport TransitionSystem, Transition |  | ||||||
| from ..gold cimport GoldParse |  | ||||||
| from . import _parse_features |  | ||||||
| from ._parse_features cimport CONTEXT_SIZE |  | ||||||
| from ._parse_features cimport fill_context |  | ||||||
| from .stateclass cimport StateClass |  | ||||||
| from .parser cimport Parser |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| DEBUG = False |  | ||||||
| def set_debug(val): |  | ||||||
|     global DEBUG |  | ||||||
|     DEBUG = val |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def get_templates(name): |  | ||||||
|     pf = _parse_features |  | ||||||
|     if name == 'ner': |  | ||||||
|         return pf.ner |  | ||||||
|     elif name == 'debug': |  | ||||||
|         return pf.unigrams |  | ||||||
|     else: |  | ||||||
|         return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ |  | ||||||
|                 pf.tree_shape + pf.trigrams) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef int BEAM_WIDTH = 16 |  | ||||||
| cdef weight_t BEAM_DENSITY = 0.001 |  | ||||||
| 
 |  | ||||||
| cdef class BeamParser(Parser): |  | ||||||
|     def __init__(self, *args, **kwargs): |  | ||||||
|         self.beam_width = kwargs.get('beam_width', BEAM_WIDTH) |  | ||||||
|         self.beam_density = kwargs.get('beam_density', BEAM_DENSITY) |  | ||||||
|         Parser.__init__(self, *args, **kwargs) |  | ||||||
| 
 |  | ||||||
|     cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: |  | ||||||
|         with gil: |  | ||||||
|             self._parseC(tokens, length, nr_feat, self.moves.n_moves) |  | ||||||
| 
 |  | ||||||
|     cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1: |  | ||||||
|         cdef Beam beam = Beam(self.moves.n_moves, self.beam_width, min_density=self.beam_density) |  | ||||||
|         # TODO: How do we handle new labels here? This increases nr_class |  | ||||||
|         beam.initialize(self.moves.init_beam_state, length, tokens) |  | ||||||
|         beam.check_done(_check_final_state, NULL) |  | ||||||
|         if beam.is_done: |  | ||||||
|             _cleanup(beam) |  | ||||||
|             return 0 |  | ||||||
|         while not beam.is_done: |  | ||||||
|             self._advance_beam(beam, None, False) |  | ||||||
|         state = <StateClass>beam.at(0) |  | ||||||
|         self.moves.finalize_state(state.c) |  | ||||||
|         for i in range(length): |  | ||||||
|             tokens[i] = state.c._sent[i] |  | ||||||
|         _cleanup(beam) |  | ||||||
| 
 |  | ||||||
|     def update(self, Doc tokens, GoldParse gold_parse, itn=0): |  | ||||||
|         self.moves.preprocess_gold(gold_parse) |  | ||||||
|         cdef Beam pred = Beam(self.moves.n_moves, self.beam_width) |  | ||||||
|         pred.initialize(self.moves.init_beam_state, tokens.length, tokens.c) |  | ||||||
|         pred.check_done(_check_final_state, NULL) |  | ||||||
|         # Hack for NER |  | ||||||
|         for i in range(pred.size): |  | ||||||
|             stcls = <StateClass>pred.at(i) |  | ||||||
|             self.moves.initialize_state(stcls.c) |  | ||||||
| 
 |  | ||||||
|         cdef Beam gold = Beam(self.moves.n_moves, self.beam_width, min_density=0.0) |  | ||||||
|         gold.initialize(self.moves.init_beam_state, tokens.length, tokens.c) |  | ||||||
|         gold.check_done(_check_final_state, NULL) |  | ||||||
|         violn = MaxViolation() |  | ||||||
|         while not pred.is_done and not gold.is_done: |  | ||||||
|             # We search separately here, to allow for ambiguity in the gold parse. |  | ||||||
|             self._advance_beam(pred, gold_parse, False) |  | ||||||
|             self._advance_beam(gold, gold_parse, True) |  | ||||||
|             violn.check_crf(pred, gold) |  | ||||||
|             if pred.loss > 0 and pred.min_score > (gold.score + self.model.time): |  | ||||||
|                 break |  | ||||||
|         else: |  | ||||||
|             # The non-monotonic oracle makes it difficult to ensure final costs are |  | ||||||
|             # correct. Therefore do final correction |  | ||||||
|             for i in range(pred.size): |  | ||||||
|                 if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse): |  | ||||||
|                     pred._states[i].loss = 0.0 |  | ||||||
|                 elif pred._states[i].loss == 0.0: |  | ||||||
|                     pred._states[i].loss = 1.0 |  | ||||||
|             violn.check_crf(pred, gold) |  | ||||||
|         if pred.size < 1: |  | ||||||
|             raise Exception("No candidates", tokens.length) |  | ||||||
|         if gold.size < 1: |  | ||||||
|             raise Exception("No gold", tokens.length) |  | ||||||
|         if pred.loss == 0: |  | ||||||
|             self.model.update_from_histories(self.moves, tokens, [(0.0, [])]) |  | ||||||
|         elif True: |  | ||||||
|             #_check_train_integrity(pred, gold, gold_parse, self.moves) |  | ||||||
|             histories = list(zip(violn.p_probs, violn.p_hist)) + \ |  | ||||||
|                         list(zip(violn.g_probs, violn.g_hist)) |  | ||||||
|             self.model.update_from_histories(self.moves, tokens, histories, min_grad=0.001**(itn+1)) |  | ||||||
|         else: |  | ||||||
|             self.model.update_from_histories(self.moves, tokens, |  | ||||||
|                 [(1.0, violn.p_hist[0]), (-1.0, violn.g_hist[0])]) |  | ||||||
|         _cleanup(pred) |  | ||||||
|         _cleanup(gold) |  | ||||||
|         return pred.loss |  | ||||||
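# Editor's note: a rough, simplified sketch (not thinc's implementation) of the
# max-violation bookkeeping that MaxViolation.check_crf performs for the update
# above: the predicted beam and the gold-constrained beam are advanced in
# lockstep, and the step where the prediction outscores the gold by the largest
# margin supplies the histories passed to update_from_histories().
def max_violation(pred_steps, gold_steps):
    # each element: (score of best state, transition history of best state)
    best_pair, best_delta = None, float('-inf')
    for (p_score, p_hist), (g_score, g_hist) in zip(pred_steps, gold_steps):
        delta = p_score - g_score
        if delta > best_delta:
            best_delta, best_pair = delta, (p_hist, g_hist)
    return best_pair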
| 
 |  | ||||||
|     def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): |  | ||||||
|         cdef atom_t[CONTEXT_SIZE] context |  | ||||||
|         cdef Pool mem = Pool() |  | ||||||
|         features = <FeatureC*>mem.alloc(self.model.nr_feat, sizeof(FeatureC)) |  | ||||||
|         if False: |  | ||||||
|             mb = Minibatch(self.model.widths, beam.size) |  | ||||||
|             for i in range(beam.size): |  | ||||||
|                 stcls = <StateClass>beam.at(i) |  | ||||||
|                 if stcls.c.is_final(): |  | ||||||
|                     nr_feat = 0 |  | ||||||
|                 else: |  | ||||||
|                     nr_feat = self.model.set_featuresC(context, features, stcls.c) |  | ||||||
|                     self.moves.set_valid(beam.is_valid[i], stcls.c) |  | ||||||
|                 mb.c.push_back(features, nr_feat, beam.costs[i], beam.is_valid[i], 0) |  | ||||||
|             self.model(mb) |  | ||||||
|             for i in range(beam.size): |  | ||||||
|                 memcpy(beam.scores[i], mb.c.scores(i), mb.c.nr_out() * sizeof(beam.scores[i][0])) |  | ||||||
|         else: |  | ||||||
|             for i in range(beam.size): |  | ||||||
|                 stcls = <StateClass>beam.at(i) |  | ||||||
|                 if not stcls.is_final(): |  | ||||||
|                     nr_feat = self.model.set_featuresC(context, features, stcls.c) |  | ||||||
|                     self.moves.set_valid(beam.is_valid[i], stcls.c) |  | ||||||
|                     self.model.set_scoresC(beam.scores[i], features, nr_feat) |  | ||||||
|         if gold is not None: |  | ||||||
|             n_gold = 0 |  | ||||||
|             lines = [] |  | ||||||
|             for i in range(beam.size): |  | ||||||
|                 stcls = <StateClass>beam.at(i) |  | ||||||
|                 if not stcls.c.is_final(): |  | ||||||
|                     self.moves.set_costs(beam.is_valid[i], beam.costs[i], stcls, gold) |  | ||||||
|                     if follow_gold: |  | ||||||
|                         for j in range(self.moves.n_moves): |  | ||||||
|                             if beam.costs[i][j] >= 1: |  | ||||||
|                                 beam.is_valid[i][j] = 0 |  | ||||||
|                                 lines.append((stcls.B(0), stcls.B(1), |  | ||||||
|                                     stcls.B_(0).ent_iob, stcls.B_(1).ent_iob, |  | ||||||
|                                     stcls.B_(1).sent_start, |  | ||||||
|                                     j, |  | ||||||
|                                     beam.is_valid[i][j], 'set invalid', |  | ||||||
|                                     beam.costs[i][j], self.moves.c[j].move, self.moves.c[j].label)) |  | ||||||
|                             n_gold += 1 if beam.is_valid[i][j] else 0 |  | ||||||
|             if follow_gold and n_gold == 0: |  | ||||||
|                 raise Exception("No gold") |  | ||||||
|         if follow_gold: |  | ||||||
|             beam.advance(_transition_state, NULL, <void*>self.moves.c) |  | ||||||
|         else: |  | ||||||
|             beam.advance(_transition_state, _hash_state, <void*>self.moves.c) |  | ||||||
|         beam.check_done(_check_final_state, NULL) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # These are passed as callbacks to thinc.search.Beam |  | ||||||
| cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: |  | ||||||
|     dest = <StateClass>_dest |  | ||||||
|     src = <StateClass>_src |  | ||||||
|     moves = <const Transition*>_moves |  | ||||||
|     dest.clone(src) |  | ||||||
|     moves[clas].do(dest.c, moves[clas].label) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef int _check_final_state(void* _state, void* extra_args) except -1: |  | ||||||
|     return (<StateClass>_state).is_final() |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def _cleanup(Beam beam): |  | ||||||
|     for i in range(beam.width): |  | ||||||
|         Py_XDECREF(<PyObject*>beam._states[i].content) |  | ||||||
|         Py_XDECREF(<PyObject*>beam._parents[i].content) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef hash_t _hash_state(void* _state, void* _) except 0: |  | ||||||
|     state = <StateClass>_state |  | ||||||
|     if state.c.is_final(): |  | ||||||
|         return 1 |  | ||||||
|     else: |  | ||||||
|         return state.c.hash() |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, TransitionSystem moves): |  | ||||||
|     for i in range(pred.size): |  | ||||||
|         if not pred._states[i].is_done or pred._states[i].loss == 0: |  | ||||||
|             continue |  | ||||||
|         state = <StateClass>pred.at(i) |  | ||||||
|         if moves.is_gold_parse(state, gold_parse) == True: |  | ||||||
|             for dep in gold_parse.orig_annot: |  | ||||||
|                 print(dep[1], dep[3], dep[4]) |  | ||||||
|             print("Cost", pred._states[i].loss) |  | ||||||
|             for j in range(gold_parse.length): |  | ||||||
|                 print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep]) |  | ||||||
|             acts = [moves.c[clas].move for clas in pred.histories[i]] |  | ||||||
|             labels = [moves.c[clas].label for clas in pred.histories[i]] |  | ||||||
|             print([moves.move_name(move, label) for move, label in zip(acts, labels)]) |  | ||||||
|             raise Exception("Predicted state is gold-standard") |  | ||||||
|     for i in range(gold.size): |  | ||||||
|         if not gold._states[i].is_done: |  | ||||||
|             continue |  | ||||||
|         state = <StateClass>gold.at(i) |  | ||||||
|         if moves.is_gold(state, gold_parse) == False: |  | ||||||
|             print("Truth") |  | ||||||
|             for dep in gold_parse.orig_annot: |  | ||||||
|                 print(dep[1], dep[3], dep[4]) |  | ||||||
|             print("Predicted good") |  | ||||||
|             for j in range(gold_parse.length): |  | ||||||
|                 print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep]) |  | ||||||
|             raise Exception("Gold parse is not gold-standard") |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
@@ -1,24 +0,0 @@
| from thinc.linear.avgtron cimport AveragedPerceptron |  | ||||||
| from thinc.typedefs cimport atom_t |  | ||||||
| from thinc.structs cimport FeatureC |  | ||||||
| 
 |  | ||||||
| from .stateclass cimport StateClass |  | ||||||
| from .arc_eager cimport TransitionSystem |  | ||||||
| from ..vocab cimport Vocab |  | ||||||
| from ..tokens.doc cimport Doc |  | ||||||
| from ..structs cimport TokenC |  | ||||||
| from ._state cimport StateC |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class ParserModel(AveragedPerceptron): |  | ||||||
|     cdef int set_featuresC(self, atom_t* context, FeatureC* features, |  | ||||||
|                             const StateC* state) nogil |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class Parser: |  | ||||||
|     cdef readonly Vocab vocab |  | ||||||
|     cdef readonly ParserModel model |  | ||||||
|     cdef readonly TransitionSystem moves |  | ||||||
|     cdef readonly object cfg |  | ||||||
| 
 |  | ||||||
|     cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil |  | ||||||
@@ -1,526 +0,0 @@
| """ |  | ||||||
| MALT-style dependency parser |  | ||||||
| """ |  | ||||||
| # coding: utf-8 |  | ||||||
| # cython: infer_types=True |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| from collections import Counter |  | ||||||
| import ujson |  | ||||||
| 
 |  | ||||||
| cimport cython |  | ||||||
| cimport cython.parallel |  | ||||||
| 
 |  | ||||||
| import numpy.random |  | ||||||
| 
 |  | ||||||
| from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF |  | ||||||
| from cpython.exc cimport PyErr_CheckSignals |  | ||||||
| from libc.stdint cimport uint32_t, uint64_t |  | ||||||
| from libc.string cimport memset, memcpy |  | ||||||
| from libc.stdlib cimport malloc, calloc, free |  | ||||||
| from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t |  | ||||||
| from thinc.linear.avgtron cimport AveragedPerceptron |  | ||||||
| from thinc.linalg cimport VecVec |  | ||||||
| from thinc.structs cimport SparseArrayC, FeatureC, ExampleC |  | ||||||
| from thinc.extra.eg cimport Example |  | ||||||
| from cymem.cymem cimport Pool, Address |  | ||||||
| from murmurhash.mrmr cimport hash64 |  | ||||||
| from preshed.maps cimport MapStruct |  | ||||||
| from preshed.maps cimport map_get |  | ||||||
| 
 |  | ||||||
| from . import _parse_features |  | ||||||
| from ._parse_features cimport CONTEXT_SIZE |  | ||||||
| from ._parse_features cimport fill_context |  | ||||||
| from .stateclass cimport StateClass |  | ||||||
| from ._state cimport StateC |  | ||||||
| from .transition_system import OracleError |  | ||||||
| from .transition_system cimport TransitionSystem, Transition |  | ||||||
| from ..structs cimport TokenC |  | ||||||
| from ..tokens.doc cimport Doc |  | ||||||
| from ..strings cimport StringStore |  | ||||||
| from ..gold cimport GoldParse |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| USE_FTRL = True |  | ||||||
| DEBUG = False |  | ||||||
| def set_debug(val): |  | ||||||
|     global DEBUG |  | ||||||
|     DEBUG = val |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def get_templates(name): |  | ||||||
|     pf = _parse_features |  | ||||||
|     if name == 'ner': |  | ||||||
|         return pf.ner |  | ||||||
|     elif name == 'debug': |  | ||||||
|         return pf.unigrams |  | ||||||
|     elif name.startswith('embed'): |  | ||||||
|         return (pf.words, pf.tags, pf.labels) |  | ||||||
|     else: |  | ||||||
|         return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ |  | ||||||
|                 pf.tree_shape + pf.trigrams) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class ParserModel(AveragedPerceptron): |  | ||||||
|     cdef int set_featuresC(self, atom_t* context, FeatureC* features, |  | ||||||
|             const StateC* state) nogil: |  | ||||||
|         fill_context(context, state) |  | ||||||
|         nr_feat = self.extracter.set_features(features, context) |  | ||||||
|         return nr_feat |  | ||||||
| 
 |  | ||||||
|     def update(self, Example eg, itn=0): |  | ||||||
|         """ |  | ||||||
|         Does regression on negative cost. Sort of cute? |  | ||||||
|         """ |  | ||||||
|         self.time += 1 |  | ||||||
|         cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class) |  | ||||||
|         cdef int guess = eg.guess |  | ||||||
|         if guess == best or best == -1: |  | ||||||
|             return 0.0 |  | ||||||
|         cdef FeatureC feat |  | ||||||
|         cdef int clas |  | ||||||
|         cdef weight_t gradient |  | ||||||
|         if USE_FTRL: |  | ||||||
|             for feat in eg.c.features[:eg.c.nr_feat]: |  | ||||||
|                 for clas in range(eg.c.nr_class): |  | ||||||
|                     if eg.c.is_valid[clas] and eg.c.scores[clas] >= eg.c.scores[best]: |  | ||||||
|                         gradient = eg.c.scores[clas] + eg.c.costs[clas] |  | ||||||
|                         self.update_weight_ftrl(feat.key, clas, feat.value * gradient) |  | ||||||
|         else: |  | ||||||
|             for feat in eg.c.features[:eg.c.nr_feat]: |  | ||||||
|                 self.update_weight(feat.key, guess, feat.value * eg.c.costs[guess]) |  | ||||||
|                 self.update_weight(feat.key, best, -feat.value * eg.c.costs[guess]) |  | ||||||
|         return eg.c.costs[guess] |  | ||||||
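# Editor's note: "regression on negative cost" in the docstring above reads the
# target score of an action as -cost, so any action scored at least as high as
# the best zero-cost action carries the error score - (-cost) == score + cost,
# which is the `gradient` used in the FTRL branch. Tiny numeric check:
score, cost = 0.5, 1.0
assert score - (-cost) == score + cost == 1.5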
| 
 |  | ||||||
|     def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0): |  | ||||||
|         cdef Pool mem = Pool() |  | ||||||
|         features = <FeatureC*>mem.alloc(self.nr_feat, sizeof(FeatureC)) |  | ||||||
| 
 |  | ||||||
|         cdef StateClass stcls |  | ||||||
| 
 |  | ||||||
|         cdef class_t clas |  | ||||||
|         self.time += 1 |  | ||||||
|         cdef atom_t[CONTEXT_SIZE] atoms |  | ||||||
|         histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist] |  | ||||||
|         if not histories: |  | ||||||
|             return None |  | ||||||
|         gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))] |  | ||||||
|         for d_loss, history in histories: |  | ||||||
|             stcls = StateClass.init(doc.c, doc.length) |  | ||||||
|             moves.initialize_state(stcls.c) |  | ||||||
|             for clas in history: |  | ||||||
|                 nr_feat = self.set_featuresC(atoms, features, stcls.c) |  | ||||||
|                 clas_grad = gradient[clas] |  | ||||||
|                 for feat in features[:nr_feat]: |  | ||||||
|                     clas_grad[feat.key] += d_loss * feat.value |  | ||||||
|                 moves.c[clas].do(stcls.c, moves.c[clas].label) |  | ||||||
|         cdef feat_t key |  | ||||||
|         cdef weight_t d_feat |  | ||||||
|         for clas, clas_grad in enumerate(gradient): |  | ||||||
|             for key, d_feat in clas_grad.items(): |  | ||||||
|                 if d_feat != 0: |  | ||||||
|                     self.update_weight_ftrl(key, clas, d_feat) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class Parser: |  | ||||||
|     """ |  | ||||||
|     Base class of the DependencyParser and EntityRecognizer. |  | ||||||
|     """ |  | ||||||
|     @classmethod |  | ||||||
|     def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg): |  | ||||||
|         """ |  | ||||||
|         Load the statistical model from the supplied path. |  | ||||||
| 
 |  | ||||||
|         Arguments: |  | ||||||
|             path (Path): |  | ||||||
|                 The path to load from. |  | ||||||
|             vocab (Vocab): |  | ||||||
|                 The vocabulary. Must be shared by the documents to be processed. |  | ||||||
|             require (bool): |  | ||||||
|                 Whether to raise an error if the files are not found. |  | ||||||
|         Returns (Parser): |  | ||||||
|             The newly constructed object. |  | ||||||
|         """ |  | ||||||
|         with (path / 'config.json').open() as file_: |  | ||||||
|             cfg = ujson.load(file_) |  | ||||||
|         # TODO: remove this shim when we don't have to support older data |  | ||||||
|         if 'labels' in cfg and 'actions' not in cfg: |  | ||||||
|             cfg['actions'] = cfg.pop('labels') |  | ||||||
|         # TODO: remove this shim when we don't have to support older data |  | ||||||
|         for action_name, labels in dict(cfg.get('actions', {})).items(): |  | ||||||
|             # We need this to be sorted |  | ||||||
|             if isinstance(labels, dict): |  | ||||||
|                 labels = list(sorted(labels.keys())) |  | ||||||
|             cfg['actions'][action_name] = labels |  | ||||||
|         self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg) |  | ||||||
|         if (path / 'model').exists(): |  | ||||||
|             self.model.load(str(path / 'model')) |  | ||||||
|         elif require: |  | ||||||
|             raise IOError( |  | ||||||
|                 "Required file %s/model not found when loading" % str(path)) |  | ||||||
|         return self |  | ||||||
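# Editor's note: hypothetical usage of load() above; the path and the `vocab`
# variable are placeholders:
#
#     from pathlib import Path
#     parser = Parser.load(Path('/path/to/model/deps'), vocab, require=True)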
| 
 |  | ||||||
|     def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg): |  | ||||||
|         """ |  | ||||||
|         Create a Parser. |  | ||||||
| 
 |  | ||||||
|         Arguments: |  | ||||||
|             vocab (Vocab): |  | ||||||
|                 The vocabulary object. Must be shared with documents to be processed. |  | ||||||
|             model (thinc.linear.AveragedPerceptron): |  | ||||||
|                 The statistical model. |  | ||||||
|         Returns (Parser): |  | ||||||
|             The newly constructed object. |  | ||||||
|         """ |  | ||||||
|         if TransitionSystem is None: |  | ||||||
|             TransitionSystem = self.TransitionSystem |  | ||||||
|         self.vocab = vocab |  | ||||||
|         cfg['actions'] = TransitionSystem.get_actions(**cfg) |  | ||||||
|         self.moves = TransitionSystem(vocab.strings, cfg['actions']) |  | ||||||
|         # TODO: Remove this when we no longer need to support old-style models |  | ||||||
|         if isinstance(cfg.get('features'), basestring): |  | ||||||
|             cfg['features'] = get_templates(cfg['features']) |  | ||||||
|         elif 'features' not in cfg: |  | ||||||
|             cfg['features'] = self.feature_templates |  | ||||||
| 
 |  | ||||||
|         self.model = ParserModel(cfg['features']) |  | ||||||
|         self.model.l1_penalty = cfg.get('L1', 0.0) |  | ||||||
|         self.model.learn_rate = cfg.get('learn_rate', 0.001) |  | ||||||
| 
 |  | ||||||
|         self.cfg = cfg |  | ||||||
|         # TODO: This is a pretty hacky fix to the problem of adding more |  | ||||||
|         # labels. The issue is they come in out of order, if labels are |  | ||||||
|         # added during training |  | ||||||
|         for label in cfg.get('extra_labels', []): |  | ||||||
|             self.add_label(label) |  | ||||||
| 
 |  | ||||||
|     def __reduce__(self): |  | ||||||
|         return (Parser, (self.vocab, self.moves, self.model), None, None) |  | ||||||
| 
 |  | ||||||
|     def __call__(self, Doc tokens): |  | ||||||
|         """ |  | ||||||
|         Apply the entity recognizer, setting the annotations onto the Doc object. |  | ||||||
| 
 |  | ||||||
|         Arguments: |  | ||||||
|             doc (Doc): The document to be processed. |  | ||||||
|         Returns: |  | ||||||
|             None |  | ||||||
|         """ |  | ||||||
|         cdef int nr_feat = self.model.nr_feat |  | ||||||
|         with nogil: |  | ||||||
|             status = self.parseC(tokens.c, tokens.length, nr_feat) |  | ||||||
|         # Check for KeyboardInterrupt etc. Untested |  | ||||||
|         PyErr_CheckSignals() |  | ||||||
|         if status != 0: |  | ||||||
|             raise ParserStateError(tokens) |  | ||||||
|         self.moves.finalize_doc(tokens) |  | ||||||
| 
 |  | ||||||
|     def pipe(self, stream, int batch_size=1000, int n_threads=2): |  | ||||||
|         """ |  | ||||||
|         Process a stream of documents. |  | ||||||
| 
 |  | ||||||
|         Arguments: |  | ||||||
|             stream: The sequence of documents to process. |  | ||||||
|             batch_size (int): |  | ||||||
|                 The number of documents to accumulate into a working set. |  | ||||||
|             n_threads (int): |  | ||||||
|                 The number of threads with which to work on the buffer in parallel. |  | ||||||
|         Yields (Doc): Documents, in order. |  | ||||||
|         """ |  | ||||||
|         cdef Pool mem = Pool() |  | ||||||
|         cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*)) |  | ||||||
|         cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int)) |  | ||||||
|         cdef Doc doc |  | ||||||
|         cdef int i |  | ||||||
|         cdef int nr_feat = self.model.nr_feat |  | ||||||
|         cdef int status |  | ||||||
|         queue = [] |  | ||||||
|         for doc in stream: |  | ||||||
|             doc_ptr[len(queue)] = doc.c |  | ||||||
|             lengths[len(queue)] = doc.length |  | ||||||
|             queue.append(doc) |  | ||||||
|             if len(queue) == batch_size: |  | ||||||
|                 with nogil: |  | ||||||
|                     for i in cython.parallel.prange(batch_size, num_threads=n_threads): |  | ||||||
|                         status = self.parseC(doc_ptr[i], lengths[i], nr_feat) |  | ||||||
|                         if status != 0: |  | ||||||
|                             with gil: |  | ||||||
|                                 raise ParserStateError(queue[i]) |  | ||||||
|                 PyErr_CheckSignals() |  | ||||||
|                 for doc in queue: |  | ||||||
|                     self.moves.finalize_doc(doc) |  | ||||||
|                     yield doc |  | ||||||
|                 queue = [] |  | ||||||
|         batch_size = len(queue) |  | ||||||
|         with nogil: |  | ||||||
|             for i in cython.parallel.prange(batch_size, num_threads=n_threads): |  | ||||||
|                 status = self.parseC(doc_ptr[i], lengths[i], nr_feat) |  | ||||||
|                 if status != 0: |  | ||||||
|                     with gil: |  | ||||||
|                         raise ParserStateError(queue[i]) |  | ||||||
|         PyErr_CheckSignals() |  | ||||||
|         for doc in queue: |  | ||||||
|             self.moves.finalize_doc(doc) |  | ||||||
|             yield doc |  | ||||||
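# Editor's note: hypothetical usage of pipe() above; `parser` and `docs` are
# placeholders. Documents come back in their original order, already parsed:
#
#     for doc in parser.pipe(docs, batch_size=1000, n_threads=2):
#         ...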
| 
 |  | ||||||
|     cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: |  | ||||||
|         state = new StateC(tokens, length) |  | ||||||
|         # NB: This can change self.moves.n_moves! |  | ||||||
|         # I think this causes memory errors if called by .pipe() |  | ||||||
|         self.moves.initialize_state(state) |  | ||||||
|         nr_class = self.moves.n_moves |  | ||||||
| 
 |  | ||||||
|         cdef ExampleC eg |  | ||||||
|         eg.nr_feat = nr_feat |  | ||||||
|         eg.nr_atom = CONTEXT_SIZE |  | ||||||
|         eg.nr_class = nr_class |  | ||||||
|         eg.features = <FeatureC*>calloc(sizeof(FeatureC), nr_feat) |  | ||||||
|         eg.atoms = <atom_t*>calloc(sizeof(atom_t), CONTEXT_SIZE) |  | ||||||
|         eg.scores = <weight_t*>calloc(sizeof(weight_t), nr_class) |  | ||||||
|         eg.is_valid = <int*>calloc(sizeof(int), nr_class) |  | ||||||
|         cdef int i |  | ||||||
|         while not state.is_final(): |  | ||||||
|             eg.nr_feat = self.model.set_featuresC(eg.atoms, eg.features, state) |  | ||||||
|             self.moves.set_valid(eg.is_valid, state) |  | ||||||
|             self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat) |  | ||||||
| 
 |  | ||||||
|             guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class) |  | ||||||
|             if guess < 0: |  | ||||||
|                 return 1 |  | ||||||
| 
 |  | ||||||
|             action = self.moves.c[guess] |  | ||||||
| 
 |  | ||||||
|             action.do(state, action.label) |  | ||||||
|             memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class) |  | ||||||
|             for i in range(eg.nr_class): |  | ||||||
|                 eg.is_valid[i] = 1 |  | ||||||
|         self.moves.finalize_state(state) |  | ||||||
|         for i in range(length): |  | ||||||
|             tokens[i] = state._sent[i] |  | ||||||
|         del state |  | ||||||
|         free(eg.features) |  | ||||||
|         free(eg.atoms) |  | ||||||
|         free(eg.scores) |  | ||||||
|         free(eg.is_valid) |  | ||||||
|         return 0 |  | ||||||
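# Editor's note: the same greedy decoding loop as parseC above, sketched in
# plain Python with hypothetical `state`/`model`/`moves` objects, purely to
# show the control flow (features -> scores -> best valid action -> apply).
def greedy_parse(state, model, moves):
    while not state.is_final():
        scores = model.score(state)                # set_featuresC + set_scoresC
        valid = moves.valid_actions(state)         # set_valid
        best = max(valid, key=lambda i: scores[i], default=None)
        if best is None:
            return 1                               # parseC signals failure with 1
        moves.apply(best, state)                   # action.do(state, action.label)
    return 0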
| 
 |  | ||||||
|     def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0): |  | ||||||
|         """ |  | ||||||
|         Update the statistical model. |  | ||||||
| 
 |  | ||||||
|         Arguments: |  | ||||||
|             doc (Doc): |  | ||||||
|                 The example document for the update. |  | ||||||
|             gold (GoldParse): |  | ||||||
|                 The gold-standard annotations, to calculate the loss. |  | ||||||
|         Returns (float): |  | ||||||
|             The loss on this example. |  | ||||||
|         """ |  | ||||||
|         self.moves.preprocess_gold(gold) |  | ||||||
|         cdef StateClass stcls = StateClass.init(tokens.c, tokens.length) |  | ||||||
|         self.moves.initialize_state(stcls.c) |  | ||||||
|         cdef Pool mem = Pool() |  | ||||||
|         cdef Example eg = Example( |  | ||||||
|                 nr_class=self.moves.n_moves, |  | ||||||
|                 nr_atom=CONTEXT_SIZE, |  | ||||||
|                 nr_feat=self.model.nr_feat) |  | ||||||
|         cdef weight_t loss = 0 |  | ||||||
|         cdef Transition action |  | ||||||
|         cdef double dropout_rate = self.cfg.get('dropout', drop) |  | ||||||
|         while not stcls.is_final(): |  | ||||||
|             eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features, |  | ||||||
|                                                     stcls.c) |  | ||||||
|             dropout(eg.c.features, eg.c.nr_feat, dropout_rate) |  | ||||||
|             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) |  | ||||||
|             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) |  | ||||||
|             guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) |  | ||||||
|             self.model.update(eg) |  | ||||||
| 
 |  | ||||||
|             action = self.moves.c[guess] |  | ||||||
|             action.do(stcls.c, action.label) |  | ||||||
|             loss += eg.costs[guess] |  | ||||||
|             eg.fill_scores(0, eg.c.nr_class) |  | ||||||
|             eg.fill_costs(0, eg.c.nr_class) |  | ||||||
|             eg.fill_is_valid(1, eg.c.nr_class) |  | ||||||
| 
 |  | ||||||
|         self.moves.finalize_state(stcls.c) |  | ||||||
|         return loss |  | ||||||
| 
 |  | ||||||
|     def step_through(self, Doc doc, GoldParse gold=None): |  | ||||||
|         """ |  | ||||||
|         Set up a stepwise state, to introspect and control the transition sequence. |  | ||||||
| 
 |  | ||||||
|         Arguments: |  | ||||||
|             doc (Doc): The document to step through. |  | ||||||
|             gold (GoldParse): Optional gold parse |  | ||||||
|         Returns (StepwiseState): |  | ||||||
|             A state object, to step through the annotation process. |  | ||||||
|         """ |  | ||||||
|         return StepwiseState(self, doc, gold=gold) |  | ||||||
| 
 |  | ||||||
|     def from_transition_sequence(self, Doc doc, sequence): |  | ||||||
|         """Control the annotations on a document by specifying a transition sequence |  | ||||||
|         to follow. |  | ||||||
| 
 |  | ||||||
|         Arguments: |  | ||||||
|             doc (Doc): The document to annotate. |  | ||||||
|             sequence: A sequence of action names, as unicode strings. |  | ||||||
|         Returns: None |  | ||||||
|         """ |  | ||||||
|         with self.step_through(doc) as stepwise: |  | ||||||
|             for transition in sequence: |  | ||||||
|                 stepwise.transition(transition) |  | ||||||
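# Editor's note: hypothetical usage of the stepwise API above (and of the
# StepwiseState class defined below); `parser` and `doc` are placeholders:
#
#     with parser.step_through(doc) as state:
#         while not state.is_final:
#             print(state.stack, state.queue)
#             state.transition()    # no argument: follow the model's prediction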
| 
 |  | ||||||
|     def add_label(self, label): |  | ||||||
|         # Doesn't set label into serializer -- subclasses override it to do that. |  | ||||||
|         for action in self.moves.action_types: |  | ||||||
|             added = self.moves.add_action(action, label) |  | ||||||
|             if added: |  | ||||||
|                 # Important that the labels be stored as a list! We need the |  | ||||||
|                 # order, or the model goes out of synch |  | ||||||
|                 self.cfg.setdefault('extra_labels', []).append(label) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1: |  | ||||||
|     if prob <= 0 or prob >= 1.: |  | ||||||
|         return 0 |  | ||||||
|     cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat) |  | ||||||
|     cdef double* probs = &py_probs[0] |  | ||||||
|     for i in range(nr_feat): |  | ||||||
|         if probs[i] >= prob: |  | ||||||
|             # Rescale kept features by the keep probability, so the expected |  | ||||||
|             # feature value is unchanged (inverted dropout). |  | ||||||
|             feats[i].value /= (1. - prob) |  | ||||||
|         else: |  | ||||||
|             feats[i].value = 0. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class StepwiseState: |  | ||||||
|     cdef readonly StateClass stcls |  | ||||||
|     cdef readonly Example eg |  | ||||||
|     cdef readonly Doc doc |  | ||||||
|     cdef readonly GoldParse gold |  | ||||||
|     cdef readonly Parser parser |  | ||||||
| 
 |  | ||||||
|     def __init__(self, Parser parser, Doc doc, GoldParse gold=None): |  | ||||||
|         self.parser = parser |  | ||||||
|         self.doc = doc |  | ||||||
|         if gold is not None: |  | ||||||
|             self.gold = gold |  | ||||||
|             self.parser.moves.preprocess_gold(self.gold) |  | ||||||
|         else: |  | ||||||
|             self.gold = GoldParse(doc) |  | ||||||
|         self.stcls = StateClass.init(doc.c, doc.length) |  | ||||||
|         self.parser.moves.initialize_state(self.stcls.c) |  | ||||||
|         self.eg = Example( |  | ||||||
|             nr_class=self.parser.moves.n_moves, |  | ||||||
|             nr_atom=CONTEXT_SIZE, |  | ||||||
|             nr_feat=self.parser.model.nr_feat) |  | ||||||
| 
 |  | ||||||
|     def __enter__(self): |  | ||||||
|         return self |  | ||||||
| 
 |  | ||||||
|     def __exit__(self, type, value, traceback): |  | ||||||
|         self.finish() |  | ||||||
| 
 |  | ||||||
|     @property |  | ||||||
|     def is_final(self): |  | ||||||
|         return self.stcls.is_final() |  | ||||||
| 
 |  | ||||||
|     @property |  | ||||||
|     def stack(self): |  | ||||||
|         return self.stcls.stack |  | ||||||
| 
 |  | ||||||
|     @property |  | ||||||
|     def queue(self): |  | ||||||
|         return self.stcls.queue |  | ||||||
| 
 |  | ||||||
|     @property |  | ||||||
|     def heads(self): |  | ||||||
|         return [self.stcls.H(i) for i in range(self.stcls.c.length)] |  | ||||||
| 
 |  | ||||||
|     @property |  | ||||||
|     def deps(self): |  | ||||||
|         return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] |  | ||||||
|                 for i in range(self.stcls.c.length)] |  | ||||||
| 
 |  | ||||||
|     @property |  | ||||||
|     def costs(self): |  | ||||||
|         """ |  | ||||||
|         Find the action-costs for the current state. |  | ||||||
|         """ |  | ||||||
|         if not self.gold: |  | ||||||
|             raise ValueError("Can't set costs: No GoldParse provided") |  | ||||||
|         self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs, |  | ||||||
|                 self.stcls, self.gold) |  | ||||||
|         costs = {} |  | ||||||
|         for i in range(self.parser.moves.n_moves): |  | ||||||
|             if not self.eg.c.is_valid[i]: |  | ||||||
|                 continue |  | ||||||
|             transition = self.parser.moves.c[i] |  | ||||||
|             name = self.parser.moves.move_name(transition.move, transition.label) |  | ||||||
|             costs[name] = self.eg.c.costs[i] |  | ||||||
|         return costs |  | ||||||
| 
 |  | ||||||
|     def predict(self): |  | ||||||
|         self.eg.reset() |  | ||||||
|         self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, |  | ||||||
|                                                             self.stcls.c) |  | ||||||
|         self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c) |  | ||||||
|         self.parser.model.set_scoresC(self.eg.c.scores, |  | ||||||
|             self.eg.c.features, self.eg.c.nr_feat) |  | ||||||
| 
 |  | ||||||
|         cdef Transition action = self.parser.moves.c[self.eg.guess] |  | ||||||
|         return self.parser.moves.move_name(action.move, action.label) |  | ||||||
| 
 |  | ||||||
|     def transition(self, action_name=None): |  | ||||||
|         if action_name is None: |  | ||||||
|             action_name = self.predict() |  | ||||||
|         moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} |  | ||||||
|         if action_name == '_': |  | ||||||
|             action_name = self.predict() |  | ||||||
|             action = self.parser.moves.lookup_transition(action_name) |  | ||||||
|         elif action_name == 'L' or action_name == 'R': |  | ||||||
|             self.predict() |  | ||||||
|             move = moves[action_name] |  | ||||||
|             clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, |  | ||||||
|                                  self.eg.c.nr_class) |  | ||||||
|             action = self.parser.moves.c[clas] |  | ||||||
|         else: |  | ||||||
|             action = self.parser.moves.lookup_transition(action_name) |  | ||||||
|         action.do(self.stcls.c, action.label) |  | ||||||
| 
 |  | ||||||
|     def finish(self): |  | ||||||
|         if self.stcls.is_final(): |  | ||||||
|             self.parser.moves.finalize_state(self.stcls.c) |  | ||||||
|         self.doc.set_parse(self.stcls.c._sent) |  | ||||||
|         self.parser.moves.finalize_doc(self.doc) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class ParserStateError(ValueError): |  | ||||||
|     def __init__(self, doc): |  | ||||||
|         ValueError.__init__(self, |  | ||||||
|             "Error analysing doc -- no valid actions available. This should " |  | ||||||
|             "never happen, so please report the error on the issue tracker. " |  | ||||||
|             "Here's the thread to do so --- reopen it if it's closed:\n" |  | ||||||
|             "https://github.com/spacy-io/spaCy/issues/429\n" |  | ||||||
|             "Please include the text that the parser failed on, which is:\n" |  | ||||||
|             "%s" % repr(doc.text)) |  | ||||||
| 
 |  | ||||||
| cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil: |  | ||||||
|     cdef int best = -1 |  | ||||||
|     for i in range(n): |  | ||||||
|         if costs[i] <= 0: |  | ||||||
|             if best == -1 or scores[i] > scores[best]: |  | ||||||
|                 best = i |  | ||||||
|     return best |  | ||||||
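# Editor's note: arg_max_if_gold picks the highest-scoring action whose cost is
# not positive, i.e. the best-scoring action still consistent with the gold
# parse; ParserModel.update compares it against the unconstrained argmax guess.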
| 
 |  | ||||||
| 
 |  | ||||||
| cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, |  | ||||||
|                        int nr_class) except -1: |  | ||||||
|     cdef weight_t score = 0 |  | ||||||
|     cdef int mode = -1 |  | ||||||
|     cdef int i |  | ||||||
|     for i in range(nr_class): |  | ||||||
|         if actions[i].move == move and (mode == -1 or scores[i] >= score): |  | ||||||
|             mode = i |  | ||||||
|             score = scores[i] |  | ||||||
|     return mode |  | ||||||