mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for 
Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
		
			
				
	
	
		
			91 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
			
		
		
	
	
			91 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
| # cython: infer_types=True, profile=True, binding=True
 | |
| from typing import Optional, Iterable
 | |
| from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config
 | |
| 
 | |
| from ..syntax.nn_parser cimport Parser
 | |
| from ..syntax.ner cimport BiluoPushDown
 | |
| 
 | |
| from ..language import Language
 | |
| 
 | |
| 
 | |
# Default model settings for the "ner" component, written in the config
# string format that thinc's Config parses. The [model] section selects
# the "spacy.TransitionBasedParser.v1" architecture and the nested
# [model.tok2vec] section selects "spacy.HashEmbedCNN.v1".
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""
# Parse the string once at import time and keep only the "model" section;
# this is the value used as the factory's default "model" setting.
DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
 | |
| 
 | |
| 
 | |
@Language.factory(
    "ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
    default_config={
        "moves": None,
        "update_with_oracle_cut_size": 100,
        "multitasks": [],
        "learn_tokens": False,
        "min_action_freq": 30,
        "model": DEFAULT_NER_MODEL,
    },
)
def make_ner(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[list],
    update_with_oracle_cut_size: int,
    multitasks: Iterable,
    learn_tokens: bool,
    min_action_freq: int,
):
    """Factory for the "ner" pipeline component.

    Builds an EntityRecognizer from the vocab of `nlp`, the given `model`
    and component `name`; every remaining argument is forwarded unchanged
    to the EntityRecognizer constructor as a keyword argument of the same
    name.

    nlp (Language): The pipeline object; only its `vocab` is used here.
    name (str): The name passed on to the component.
    model (Model): The model passed on to the component.
    moves (Optional[list]): Forwarded as `moves`.
    update_with_oracle_cut_size (int): Forwarded as
        `update_with_oracle_cut_size`.
    multitasks (Iterable): Forwarded as `multitasks`.
    learn_tokens (bool): Forwarded as `learn_tokens`.
    min_action_freq (int): Forwarded as `min_action_freq`.
    RETURNS (EntityRecognizer): The newly constructed component.
    """
    # Collect the keyword-only settings first so the constructor call
    # itself stays short; insertion order matches the original call.
    component_settings = {
        "moves": moves,
        "update_with_oracle_cut_size": update_with_oracle_cut_size,
        "multitasks": multitasks,
        "learn_tokens": learn_tokens,
        "min_action_freq": min_action_freq,
    }
    return EntityRecognizer(nlp.vocab, model, name, **component_settings)
 | |
| 
 | |
| 
 | |
cdef class EntityRecognizer(Parser):
    """Pipeline component for named entity recognition.

    DOCS: https://spacy.io/api/entityrecognizer
    """
    # Transition system class used by this Parser subclass.
    TransitionSystem = BiluoPushDown

    def add_multitask_objective(self, mt_component):
        """Register an additional multitask component.

        mt_component: The component to append to `self._multitasks`.
        """
        self._multitasks.append(mt_component)

    def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
        """Size and initialize every registered multitask component.

        For each component in `self._multitasks`, the model's "nO" dimension
        (and that of its "output_layer" reference, if it has one) is set to
        the number of entity labels, then `begin_training` is called on it.

        get_examples: Passed through to each component's `begin_training`.
        pipeline: Passed through as the `pipeline` keyword argument.
        sgd: Accepted for interface compatibility; not used here.
        **cfg: Accepted for interface compatibility; not used here.
        """
        # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
        for labeller in self._multitasks:
            # `labels` is recomputed from the moves on every access, so
            # evaluate it once per component rather than once per set_dim.
            n_labels = len(self.labels)
            labeller.model.set_dim("nO", n_labels)
            if labeller.model.has_ref("output_layer"):
                labeller.model.get_ref("output_layer").set_dim("nO", n_labels)
            labeller.begin_training(get_examples, pipeline=pipeline)

    @property
    def labels(self):
        """The entity labels, derived from the available move names.

        RETURNS (tuple): The distinct labels, sorted.
        """
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        # Split on the first hyphen only: the prefix is a single B/I/L/U
        # letter, and the label itself may contain hyphens (e.g.
        # "B-WORK-OF-ART" must yield "WORK-OF-ART", not "WORK").
        entity_labels = {
            move.split("-", 1)[1]
            for move in self.move_names
            if move[0] in ("B", "I", "L", "U")
        }
        return tuple(sorted(entity_labels))
 |