mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			203 lines
		
	
	
		
			9.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			203 lines
		
	
	
		
			9.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
include ../../_includes/_mixins
 | 
						||
 | 
						||
p
 | 
						||
    |  This guide describes how to train new statistical models for spaCy's
 | 
						||
    |  part-of-speech tagger, named entity recognizer and dependency parser.
 | 
						||
    |  Once the model is trained, you can then
 | 
						||
    |  #[+a("/docs/usage/saving-loading") save and load] it.
 | 
						||
 | 
						||
+h(2, "101") Training 101
 | 
						||
 | 
						||
include _spacy-101/_training
 | 
						||
 | 
						||
+h(3, "training-data") How do I get training data?
 | 
						||
 | 
						||
p
 | 
						||
    |  Collecting training data may sound incredibly painful – and it can be,
 | 
						||
    |  if you're planning a large-scale annotation project. However, if your main
 | 
						||
    |  goal is to update an existing model's predictions – for example, spaCy's
 | 
						||
    |  named entity recognition – the hard part is usually not creating the
 | 
						||
    |  actual annotations. It's finding representative examples and
 | 
						||
    |  #[strong extracting potential candidates]. The good news is, if you've
 | 
						||
    |  been noticing bad performance on your data, you likely
 | 
						||
    |  already have some relevant text, and you can use spaCy to
 | 
						||
    |  #[strong bootstrap a first set of training examples]. For example,
 | 
						||
    |  after processing a few sentences, you may end up with the following
 | 
						||
    |  entities, some correct, some incorrect.
 | 
						||
 | 
						||
+aside("How many examples do I need?")
 | 
						||
    |  As a rule of thumb, you should allocate at least 10% of your project
 | 
						||
    |  resources to creating training and evaluation data. If you're looking to
 | 
						||
    |  improve an existing model, you might be able to start off with only a
 | 
						||
    |  handful of examples. Keep in mind that you'll always want a lot more than
 | 
						||
    |  that for #[strong evaluation] – especially previous errors the model has
 | 
						||
    |  made. Otherwise, you won't be able to sufficiently verify that the model
 | 
						||
    |  has actually made the #[strong correct generalisations] required for your
 | 
						||
    |  use case.
 | 
						||
 | 
						||
+table(["Text", "Entity", "Start", "End", "Label", ""])
 | 
						||
    - var style = [0, 0, 1, 1, 1]
 | 
						||
    +annotation-row(["Uber blew through $1 million a week", "Uber", 0, 4, "ORG"], style)
 | 
						||
        +cell #[+procon("pro")]
 | 
						||
    +annotation-row(["Android Pay expands to Canada", "Android", 0, 7, "PERSON"], style)
 | 
						||
        +cell #[+procon("con")]
 | 
						||
    +annotation-row(["Android Pay expands to Canada", "Canada", 23, 29, "GPE"], style)
 | 
						||
        +cell #[+procon("pro")]
 | 
						||
    +annotation-row(["Spotify steps up Asia expansion", "Spotify", 0, 7, "ORG"], style)
 | 
						||
        +cell #[+procon("pro")]
 | 
						||
    +annotation-row(["Spotify steps up Asia expansion", "Asia", 17, 21, "NORP"], style)
 | 
						||
        +cell #[+procon("con")]
 | 
						||
 | 
						||
p
 | 
						||
    |  Alternatively, the
 | 
						||
    |  #[+a("/docs/usage/rule-based-matching#example3") rule-based matcher]
 | 
						||
    |  can be a useful tool to extract tokens or combinations of tokens, as
 | 
						||
    |  well as their start and end index in a document. In this case, we'll
 | 
						||
    |  extract mentions of Google and assume they're an #[code ORG].
 | 
						||
 | 
						||
+table(["Text", "Entity", "Start", "End", "Label", ""])
 | 
						||
    - var style = [0, 0, 1, 1, 1]
 | 
						||
    +annotation-row(["let me google this for you", "google", 7, 13, "ORG"], style)
 | 
						||
        +cell #[+procon("con")]
 | 
						||
    +annotation-row(["Google Maps launches location sharing", "Google", 0, 6, "ORG"], style)
 | 
						||
        +cell #[+procon("con")]
 | 
						||
    +annotation-row(["Google rebrands its business apps", "Google", 0, 6, "ORG"], style)
 | 
						||
        +cell #[+procon("pro")]
 | 
						||
    +annotation-row(["look what i found on google! 😂", "google", 21, 27, "ORG"], style)
 | 
						||
        +cell #[+procon("con")]
 | 
						||
 | 
						||
p
 | 
						||
    |  Based on the few examples above, you can already create six training
 | 
						||
    |  sentences with eight entities in total. Of course, what you consider a
 | 
						||
    |  "correct annotation" will always depend on
 | 
						||
    |  #[strong what you want the model to learn]. While there are some entity
 | 
						||
    |  annotations that are more or less universally correct – like Canada being
 | 
						||
    |  a geopolitical entity – your application may have its very own definition
 | 
						||
    |  of the #[+a("/docs/api/annotation#named-entities") NER annotation scheme].
 | 
						||
 | 
						||
+code.
 | 
						||
    train_data = [
 | 
						||
        ("Uber blew through $1 million a week", [(0, 4, 'ORG')]),
 | 
						||
        ("Android Pay expands to Canada", [(0, 11, 'PRODUCT'), (23, 29, 'GPE')]),
 | 
						||
        ("Spotify steps up Asia expansion", [(0, 7, "ORG"), (17, 21, "LOC")]),
 | 
						||
        ("Google Maps launches location sharing", [(0, 11, "PRODUCT")]),
 | 
						||
        ("Google rebrands its business apps", [(0, 6, "ORG")]),
 | 
						||
        ("look what i found on google! 😂", [(21, 27, "PRODUCT")])]
 | 
						||
 | 
						||
+h(2) Training with annotations
 | 
						||
 | 
						||
p
 | 
						||
    |  The #[+api("goldparse") #[code GoldParse]] object collects the annotated
 | 
						||
    |  training examples, also called the #[strong gold standard]. It's
 | 
						||
    |  initialised with the #[+api("doc") #[code Doc]] object it refers to,
 | 
						||
    |  and keyword arguments specifying the annotations, like #[code tags]
 | 
						||
    |  or #[code entities]. Its job is to encode the annotations, keep them
 | 
						||
    |  aligned and create the C-level data structures required for efficient access.
 | 
						||
    |  Here's an example of a simple #[code GoldParse] for part-of-speech tags:
 | 
						||
 | 
						||
+code.
 | 
						||
    vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
 | 
						||
    doc = Doc(vocab, words=['I', 'like', 'stuff'])
 | 
						||
    gold = GoldParse(doc, tags=['N', 'V', 'N'])
 | 
						||
 | 
						||
p
 | 
						||
    |  Using the #[code Doc] and its gold-standard annotations, the model can be
 | 
						||
    |  updated to learn a sentence of three words with their assigned
 | 
						||
    |  part-of-speech tags. The #[+a("/docs/usage/adding-languages#tag-map") tag map]
 | 
						||
    |  is part of the vocabulary and defines the annotation scheme. If you're
 | 
						||
    |  training a new language model, this will let you map the tags present in
 | 
						||
    |  the treebank you train on to spaCy's tag scheme.
 | 
						||
 | 
						||
+code.
 | 
						||
    doc = Doc(Vocab(), words=['Facebook', 'released', 'React', 'in', '2014'])
 | 
						||
    gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
 | 
						||
 | 
						||
p
 | 
						||
    |  The same goes for named entities. The letters added before the labels
 | 
						||
    |  refer to the tags of the
 | 
						||
    |  #[+a("/docs/usage/entity-recognition#updating-biluo") BILUO scheme] –
 | 
						||
    |  #[code O] is a token outside an entity, #[code U] a single entity unit,
 | 
						||
    |  #[code B] the beginning of an entity, #[code I] a token inside an entity
 | 
						||
    |  and #[code L] the last token of an entity.
 | 
						||
 | 
						||
+aside
 | 
						||
    |  #[strong Training data]: The training examples.#[br]
 | 
						||
    |  #[strong Text and label]: The current example.#[br]
 | 
						||
    |  #[strong Doc]: A #[code Doc] object created from the example text.#[br]
 | 
						||
    |  #[strong GoldParse]: A #[code GoldParse] object of the #[code Doc] and label.#[br]
 | 
						||
    |  #[strong nlp]: The #[code nlp] object with the model.#[br]
 | 
						||
    |  #[strong Optimizer]: A function that holds state between updates.#[br]
 | 
						||
    |  #[strong Update]: Update the model's weights.#[br]
 | 
						||
    |  #[strong ]
 | 
						||
 | 
						||
+image
 | 
						||
    include ../../assets/img/docs/training-loop.svg
 | 
						||
    .u-text-right
 | 
						||
        +button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic
 | 
						||
 | 
						||
p
 | 
						||
    |  Of course, it's not enough to only show a model a single example once.
 | 
						||
    |  Especially if you only have a few examples, you'll want to train for a
 | 
						||
    |  #[strong number of iterations]. At each iteration, the training data is
 | 
						||
    |  #[strong shuffled] to ensure the model doesn't make any generalisations
 | 
						||
    |  based on the order of examples. Another technique to improve the learning
 | 
						||
    |  results is to set a #[strong dropout rate], a rate at which to randomly
 | 
						||
    |  "drop" individual features and representations. This makes it harder for
 | 
						||
    |  the model to memorise the training data. For example, a #[code 0.25]
 | 
						||
    |  dropout means that each feature or internal representation has a 1/4
 | 
						||
    |  likelihood of being dropped.
 | 
						||
 | 
						||
+aside
 | 
						||
    |  #[+api("language#begin_training") #[code begin_training()]]: Start the
 | 
						||
    |  training and return an optimizer function to update the model's weights.#[br]
 | 
						||
    |  #[+api("language#update") #[code update()]]: Update the model with the
 | 
						||
    |  training example and gold data.#[br]
 | 
						||
    |  #[+api("language#to_disk") #[code to_disk()]]: Save the updated model to
 | 
						||
    |  a directory.
 | 
						||
 | 
						||
+code("Example training loop").
 | 
						||
    optimizer = nlp.begin_training(get_data)
 | 
						||
    for itn in range(100):
 | 
						||
        random.shuffle(train_data)
 | 
						||
        for raw_text, entity_offsets in train_data:
 | 
						||
            doc = nlp.make_doc(raw_text)
 | 
						||
            gold = GoldParse(doc, entities=entity_offsets)
 | 
						||
            nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
 | 
						||
    nlp.to_disk('/model')
 | 
						||
 | 
						||
+table(["Name", "Description"])
 | 
						||
    +row
 | 
						||
        +cell #[code train_data]
 | 
						||
        +cell The training data.
 | 
						||
 | 
						||
    +row
 | 
						||
        +cell #[code get_data]
 | 
						||
        +cell A function converting the training data to spaCy's JSON format.
 | 
						||
 | 
						||
    +row
 | 
						||
        +cell #[code doc]
 | 
						||
        +cell #[+api("doc") #[code Doc]] objects.
 | 
						||
 | 
						||
    +row
 | 
						||
        +cell #[code gold]
 | 
						||
        +cell #[+api("goldparse") #[code GoldParse]] objects.
 | 
						||
 | 
						||
    +row
 | 
						||
        +cell #[code drop]
 | 
						||
        +cell Dropout rate. Makes it harder for the model to just memorise the data.
 | 
						||
 | 
						||
    +row
 | 
						||
        +cell #[code optimizer]
 | 
						||
        +cell Callable to update the model's weights.
 | 
						||
 | 
						||
+infobox
 | 
						||
    |  For the #[strong full example and more details], see the usage guide on
 | 
						||
    |  #[+a("/docs/usage/training-ner") training the named entity recognizer],
 | 
						||
    |  or the runnable
 | 
						||
    |  #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
 | 
						||
    |  on GitHub.
 | 
						||
 | 
						||
+h(2) Examples
 | 
						||
 | 
						||
+under-construction
 |