include ../../_includes/_mixins

p
    |  All #[+a("/docs/usage/models") spaCy models] support online learning, so
    |  you can update a pre-trained model with new examples. You can even add
    |  new classes to an existing model, to recognise a new entity type,
    |  part-of-speech, or syntactic relation. Updating an existing model is
    |  particularly useful as a "quick and dirty solution", if you have only a
    |  few corrections or annotations.
+h(2, "improving-accuracy") Improving accuracy on existing entity types
 | 
						|
 | 
						|
p
 | 
						|
    |  To update the model, you first need to create an instance of
 | 
						|
    |  #[+api("goldparse") #[code GoldParse]], with the entity labels
 | 
						|
    |  you want to learn. You'll usually need to provide many examples to
 | 
						|
    |  meaningfully improve the system — a few hundred is a good start, although
 | 
						|
    |  more is better.
 | 
						|
 | 
						|
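
p
    |  For example, a minimal #[code GoldParse] for one annotated text could
    |  look like this. The text and offsets below are made up purely for
    |  illustration, and #[code nlp] is assumed to be an already loaded
    |  pipeline:

+code.
    from spacy.gold import GoldParse

    # assumes nlp is an already loaded pipeline, e.g. the one trained below
    doc = nlp.make_doc(u'Facebook released React in 2014.')
    # entity annotations as (start_char, end_char, label) tuples
    entities = [(0, 8, u'ORG')]
    gold = GoldParse(doc, entities=entities)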

+image
    include ../../assets/img/docs/training-loop.svg
    .u-text-right
        +button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic

p
    |  You should avoid iterating over the same few examples multiple times, or
    |  the model is likely to "forget" how to annotate other examples. If you
    |  do, you're effectively changing the loss function: the optimizer will
    |  find a way to minimize the loss on your examples, without regard for the
    |  consequences on the examples it's no longer paying attention to.

p
    |  One way to avoid this "catastrophic forgetting" problem is to "remind"
    |  the model of other examples by augmenting your annotations with sentences
    |  annotated with entities automatically recognised by the original model.
    |  Ultimately, this is an empirical process: you'll need to
    |  #[strong experiment on your own data] to find a solution that works best
    |  for you.
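
p
    |  As a rough sketch, you could mix in such "revision data" like this.
    |  Here, #[code original_nlp] and #[code new_examples] are placeholders
    |  for your unmodified, pre-trained model and your new annotations:

+code.
    revision_texts = [u'Net income was $9.4 million compared to the prior year.',
                      u'Revenue exceeded twelve billion dollars.']
    revision_data = []
    for text in revision_texts:
        doc = original_nlp(text)  # placeholder: your unmodified, pre-trained model
        # treat the original model's predictions as additional "gold" annotations
        entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        revision_data.append((text, entities))
    # mix the revision examples in with your new annotations before training
    train_data = new_examples + revision_data  # new_examples: your own annotations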
+h(2, "example") Example
 | 
						|
 | 
						|
+under-construction
 | 
						|
 | 
						|

+code.
    import random
    from spacy.lang.en import English
    from spacy.gold import GoldParse, biluo_tags_from_offsets

    def main(model_dir=None):
        # training data: texts paired with (start, end, label) entity offsets
        train_data = [
            ('Who is Shaka Khan?',
                [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]),
            ('I like London and Berlin.',
                [(len('I like '), len('I like London'), 'LOC'),
                 (len('I like London and '), len('I like London and Berlin'), 'LOC')])
        ]
        # blank English pipeline with a tensorizer and an entity recognizer
        nlp = English(pipeline=['tensorizer', 'ner'])
        get_data = lambda: reformat_train_data(nlp.tokenizer, train_data)
        # begin_training initialises the model weights and returns an optimizer
        optimizer = nlp.begin_training(get_data)
        for itn in range(100):
            random.shuffle(train_data)
            losses = {}
            for raw_text, entity_offsets in train_data:
                doc = nlp.make_doc(raw_text)
                gold = GoldParse(doc, entities=entity_offsets)
                nlp.update([doc], [gold], drop=0.5, sgd=optimizer, losses=losses)
        # only save the model if an output directory was given
        if model_dir is not None:
            nlp.to_disk(model_dir)

+code.
    def reformat_train_data(tokenizer, examples):
        """Reformat data to match spaCy's JSON training format."""
        output = []
        for text, entity_offsets in examples:
            doc = tokenizer(text)
            ner_tags = biluo_tags_from_offsets(doc, entity_offsets)
            words = [w.text for w in doc]
            # dummy values for the other annotations, since only the entities matter here
            tags = ['-'] * len(doc)
            heads = [0] * len(doc)
            deps = [''] * len(doc)
            sentence = (list(range(len(doc))), words, tags, heads, deps, ner_tags)
            output.append((text, [(sentence, [])]))
        return output

p.u-text-right
    +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary").u-text-tag View full example
+h(2, "saving-loading") Saving and loading
 | 
						|
 | 
						|
p
 | 
						|
    |  After training our model, you'll usually want to save its state, and load
 | 
						|
    |  it back later. You can do this with the
 | 
						|
    |  #[+api("language#to_disk") #[code Language.to_disk()]] method:
 | 
						|
 | 
						|
+code.
 | 
						|
    nlp.to_disk('/home/me/data/en_technology')
 | 
						|
 | 
						|
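
p
    |  To load it back in later, you can pass the directory path to
    |  #[code spacy.load()]:

+code.
    import spacy

    nlp = spacy.load('/home/me/data/en_technology')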

p
    |  To make the model more convenient to deploy, we recommend wrapping it as
    |  a Python package, so that you can install it via pip and load it as a
    |  module. spaCy comes with a handy #[+api("cli#package") #[code package]]
    |  CLI command to create all required files and directories.

+code(false, "bash").
    python -m spacy package /home/me/data/en_technology /home/me/my_models

p
    |  To build the package and create a #[code .tar.gz] archive, run
    |  #[code python setup.py sdist] from within its directory.
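
p
    |  You can then install the package via pip and load it like any other
    |  Python module. The exact archive name depends on the #[code name] and
    |  #[code version] in your package's meta data; #[code en_technology] is
    |  just an example here:

+code(false, "bash").
    # exact file name depends on your package's meta data
    pip install dist/en_technology-0.0.0.tar.gz

+code.
    import en_technology  # package name as defined in the model's meta.json

    nlp = en_technology.load()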

+infobox("Saving and loading models")
    |  For more information and a detailed guide on how to package your model,
    |  see the documentation on
    |  #[+a("/docs/usage/saving-loading#models") saving and loading models].