mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 01:48:04 +03:00

Checkpoint -- nearly finished reimpl

This commit is contained in:
parent 4441866f55
commit 35458987e8
@@ -28,6 +28,8 @@ from murmurhash.mrmr cimport hash64
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 
+from thinc.api import layerize
+
 from numpy import exp
 
 from . import _parse_features
@@ -55,40 +57,45 @@ def set_debug(val):
 
 
 def get_greedy_model_for_batch(tokvecs, TransitionSystem moves, feat_maps, upper_model):
-    is_valid = model.ops.allocate((len(docs), system.n_moves), dtype='i')
-    costs = model.ops.allocate((len(docs), system.n_moves), dtype='f')
-    token_ids = model.ops.allocate((len(docs), StateClass.nr_context_tokens()),
+    cdef int[:, :] is_valid_
+    cdef float[:, :] costs_
+    cdef int[:, :] token_ids
+    is_valid = upper_model.ops.allocate((len(tokvecs), moves.n_moves), dtype='i')
+    costs = upper_model.ops.allocate((len(tokvecs), moves.n_moves), dtype='f')
+    token_ids = upper_model.ops.allocate((len(tokvecs), StateClass.nr_context_tokens()),
                                          dtype='uint64')
-    cached, backprops = zip(*[lyr.begin_update(tokvecs) for lyr in feat_maps)
+    cached, backprops = zip(*[lyr.begin_update(tokvecs) for lyr in feat_maps])
+    is_valid_ = is_valid
+    costs_ = costs
 
     def forward(states, drop=0.):
-        nonlocal is_valid, costs, token_ids, features
+        nonlocal is_valid, costs, token_ids, moves
         is_valid = is_valid[:len(states)]
         costs = costs[:len(states)]
         token_ids = token_ids[:len(states)]
         is_valid = is_valid[:len(states)]
-        for state in states:
-            state.set_context_tokens(&token_ids[i])
-            moves.set_valid(&is_valid[i], state.c)
+        cdef StateClass state
+        for i, state in enumerate(states):
+            state.set_context_tokens(token_ids[i])
+            moves.set_valid(&is_valid_[i, 0], state.c)
 
         features = cached[token_ids].sum(axis=1)
 
         scores, bp_scores = upper_model.begin_update(features, drop=drop)
-        softmaxed = model.ops.softmax(scores)
+        softmaxed = upper_model.ops.softmax(scores)
         # Renormalize for invalid actions
         softmaxed *= is_valid
         softmaxed /= softmaxed.sum(axis=1).reshape((softmaxed.shape[0], 1))
 
         def backward(golds, sgd=None):
-            nonlocal costs_, is_valid_, moves_
-            cdef TransitionSystem moves = moves_
-            cdef int[:, :] is_valid
-            cdef float[:, :] costs
+            nonlocal costs_, is_valid_, moves
             for i, (state, gold) in enumerate(zip(states, golds)):
-                moves.set_costs(&costs[i], &is_valid[i],
+                moves.set_costs(&is_valid_[i, 0], &costs_[i, 0],
                     state, gold)
-            set_log_loss(model.ops, d_scores,
-                scores, is_valid, costs)
+            d_scores = scores.copy()
+            d_scores.fill(0)
+            set_log_loss(upper_model.ops, d_scores,
+                scores, is_valid_, costs_)
             d_tokens = bp_scores(d_scores, sgd)
             return d_tokens
 
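For orientation, here is a minimal NumPy sketch of what the new forward() computes, with toy shapes and a plain weight matrix standing in for upper_model (all names and shapes below are illustrative assumptions, not spaCy's API): gather the cached feature rows for each state's context tokens, sum them, score the batch, then zero out invalid transitions and renormalize.

    # Illustrative sketch only -- toy stand-ins, not spaCy's ops/model API.
    import numpy

    def forward_sketch(cached, token_ids, weights, is_valid):
        # cached: (n_tokens, width) precomputed feature-map rows
        # token_ids: (n_states, n_context) indices into cached
        # weights: (width, n_moves) stand-in for the upper model
        # is_valid: (n_states, n_moves) 0/1 mask from the transition system
        features = cached[token_ids].sum(axis=1)                 # (n_states, width)
        scores = features @ weights                              # (n_states, n_moves)
        exped = numpy.exp(scores - scores.max(axis=1, keepdims=True))
        softmaxed = exped / exped.sum(axis=1, keepdims=True)
        softmaxed *= is_valid                                    # zero out invalid actions
        softmaxed /= softmaxed.sum(axis=1).reshape((softmaxed.shape[0], 1))
        return softmaxed

    cached = numpy.random.rand(10, 4)
    token_ids = numpy.array([[0, 1, 2], [3, 4, 5]])
    weights = numpy.random.rand(4, 3)
    is_valid = numpy.array([[1, 1, 0], [0, 1, 1]])
    print(forward_sketch(cached, token_ids, weights, is_valid).sum(axis=1))  # each row sums to 1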
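The backward() closure fills d_scores via set_log_loss, gated by the validity and cost arrays the transition system writes. Below is a generic sketch of that kind of masked log-loss gradient, assuming a cross-entropy objective over valid actions with the zero-cost (gold) actions as the target; it is an illustration, not spaCy's actual set_log_loss.

    # Illustrative sketch only -- a common masked log-loss gradient, not spaCy's set_log_loss.
    import numpy

    def log_loss_gradient(scores, is_valid, costs):
        # scores, costs: (n_states, n_moves) floats; is_valid: (n_states, n_moves) 0/1
        exped = numpy.exp(scores - scores.max(axis=1, keepdims=True)) * is_valid
        probs = exped / exped.sum(axis=1, keepdims=True)      # softmax over valid moves only
        target = (costs == 0) * is_valid                      # zero-cost, valid moves
        target = target / target.sum(axis=1, keepdims=True)   # assumes >= 1 gold move per row
        return (probs - target) * is_valid                    # gradient w.r.t. scores

    scores = numpy.array([[1.0, 2.0, 0.0]])
    is_valid = numpy.array([[1, 1, 0]])
    costs = numpy.array([[0.0, 2.0, 0.0]])
    print(log_loss_gradient(scores, is_valid, costs))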
@@ -119,6 +126,17 @@ def transition_batch(TransitionSystem moves, states, scores):
         action.do(state.c, action.label)
 
 
+def init_states(TransitionSystem moves, docs):
+    states = []
+    cdef Doc doc
+    cdef StateClass state
+    for i, doc in enumerate(docs):
+        state = StateClass.init(doc.c, doc.length)
+        moves.initialize_state(state.c)
+        states.append(state)
+    return states
+
+
 cdef class Parser:
     """
     Base class of the DependencyParser and EntityRecognizer.
@@ -176,7 +194,8 @@ cdef class Parser:
     def build_model(self, width=32, nr_vector=1000, nF=1, nB=1, nS=1, nL=1, nR=1, **_):
         nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR)
         self.model = build_model(width*2, 2, self.moves.n_moves)
-        self.feature_maps = build_feature_maps(nr_context_tokens, width, nr_vector))
+        # TODO
+        self.feature_maps = [] #build_feature_maps(nr_context_tokens, width, nr_vector)
 
     def __call__(self, Doc tokens):
         """
@@ -248,6 +267,7 @@ cdef class Parser:
 
         model = get_greedy_model_for_batch([d.tensor for d in docs],
                     self.moves, self.model, self.feat_maps)
+        states = init_states(self.moves, docs)
 
         d_tokens = [self.model.ops.allocate(d.tensor.shape) for d in docs]
         output = list(d_tokens)
@@ -261,7 +281,7 @@ cdef class Parser:
             transition_batch(self.moves, states)
             # Get unfinished states (and their matching gold and token gradients)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
-        return output, sum(losses)
+        return output
 
     def begin_training(self, docs, golds):
         for gold in golds:
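Taken together, the Parser hunks above assemble a greedy batch loop: initialize one state per doc, score the unfinished states, apply the predicted transitions, and drop states that are final. A structural sketch of that loop with toy stand-ins (ToyState and score_fn are invented here for illustration, not spaCy classes):

    # Structural sketch only -- toy stand-ins, not spaCy's StateClass/TransitionSystem.
    class ToyState:
        def __init__(self, length):
            self.steps_left = length
        def is_final(self):
            return self.steps_left == 0
        def do_best_action(self, score):
            self.steps_left -= 1        # stand-in for applying a transition to a parse state

    def parse_batch(lengths, score_fn):
        states = [ToyState(n) for n in lengths]           # cf. init_states()
        todo = list(states)
        while todo:
            scores = [score_fn(s) for s in todo]          # cf. the model's forward pass
            for state, score in zip(todo, scores):
                state.do_best_action(score)               # cf. transition_batch()
            todo = [s for s in todo if not s.is_final()]  # keep only unfinished states
        return states

    print([s.is_final() for s in parse_batch([3, 1, 2], score_fn=lambda s: 0.0)])  # [True, True, True]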
@@ -336,31 +356,6 @@ def _begin_update(self, model, states, tokvecs, drop=0.):
         return finish_update(d_scores, sgd=sgd)
     return softmaxed, backward
 
-def _init_states(self, docs):
-    states = []
-    cdef Doc doc
-    cdef StateClass state
-    for i, doc in enumerate(docs):
-        state = StateClass.init(doc.c, doc.length)
-        self.moves.initialize_state(state.c)
-        states.append(state)
-    return states
-
-def _validate_batch(self, int[:, ::1] is_valid, states):
-    cdef StateClass state
-    cdef int i
-    for i, state in enumerate(states):
-        self.moves.set_valid(&is_valid[i, 0], state.c)
-
-def _cost_batch(self, weight_t[:, ::1] costs, int[:, ::1] is_valid,
-        states, golds):
-    cdef int i
-    cdef StateClass state
-    cdef GoldParse gold
-    for i, (state, gold) in enumerate(zip(states, golds)):
-        self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], state, gold)
-
-
 
 def _get_features(self, states, all_tokvecs, attr_names,
         nF=1, nB=0, nS=2, nL=2, nR=2):