Mirror of https://github.com/explosion/spaCy.git
Update training examples to use "simple style"

commit fe498b3d5e
parent 906aece532
examples/training/train_intent_parser.py
@@ -14,55 +14,49 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
 ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
 ('hotel', 'PLACE', 'show') --> show PLACE hotel
 ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
+
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
 import spacy
-from spacy.gold import GoldParse
-from spacy.tokens import Doc
 from pathlib import Path
 
 
-# training data: words, head and dependency labels
+# training data: texts, heads and dependency labels
 # for no relation, we simply chose an arbitrary dependency label, e.g. '-'
 TRAIN_DATA = [
-    (
-        ['find', 'a', 'cafe', 'with', 'great', 'wifi'],
-        [0, 2, 0, 5, 5, 2],  # index of token head
-        ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
-    ),
-    (
-        ['find', 'a', 'hotel', 'near', 'the', 'beach'],
-        [0, 2, 0, 5, 5, 2],
-        ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
-    ),
-    (
-        ['find', 'me', 'the', 'closest', 'gym', 'that', "'s", 'open', 'late'],
-        [0, 0, 4, 4, 0, 6, 4, 6, 6],
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
-    ),
-    (
-        ['show', 'me', 'the', 'cheapest', 'store', 'that', 'sells', 'flowers'],
-        [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
-    ),
-    (
-        ['find', 'a', 'nice', 'restaurant', 'in', 'london'],
-        [0, 3, 3, 0, 3, 3],
-        ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
-    ),
-    (
-        ['show', 'me', 'the', 'coolest', 'hostel', 'in', 'berlin'],
-        [0, 0, 4, 4, 0, 4, 4],
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
-    ),
-    (
-        ['find', 'a', 'good', 'italian', 'restaurant', 'near', 'work'],
-        [0, 4, 4, 4, 0, 4, 5],
-        ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
-    )
+    ("find a cafe with great wifi", {
+        'heads': [0, 2, 0, 5, 5, 2],  # index of token head
+        'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
+    }),
+    ("find a hotel near the beach", {
+        'heads': [0, 2, 0, 5, 5, 2],
+        'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
+    }),
+    ("find me the closest gym that's open late", {
+        'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6],
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
+    }),
+    ("show me the cheapest store that sells flowers", {
+        'heads': [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
+    }),
+    ("find a nice restaurant in london", {
+        'heads': [0, 3, 3, 0, 3, 3],
+        'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
+    }),
+    ("show me the coolest hostel in berlin", {
+        'heads': [0, 0, 4, 4, 0, 4, 4],
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
+    }),
+    ("find a good italian restaurant near work", {
+        'heads': [0, 4, 4, 4, 0, 4, 5],
+        'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
+    })
 ]
@@ -88,8 +82,8 @@ def main(model=None, output_dir=None, n_iter=100):
     else:
         parser = nlp.get_pipe('parser')
 
-    for _, _, deps in TRAIN_DATA:
-        for dep in deps:
+    for text, annotations in TRAIN_DATA:
+        for dep in annotations.get('deps', []):
             parser.add_label(dep)
 
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
@@ -98,10 +92,8 @@ def main(model=None, output_dir=None, n_iter=100):
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
-            for words, heads, deps in TRAIN_DATA:
-                doc = Doc(nlp.vocab, words=words)
-                gold = GoldParse(doc, heads=heads, deps=deps)
-                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+            for text, annotations in TRAIN_DATA:
+                nlp.update([text], [annotations], sgd=optimizer, losses=losses)
             print(losses)
 
     # test the trained model
@@ -147,6 +139,7 @@ if __name__ == '__main__':
     #   ('find', 'ROOT', 'find'),
    #   ('cheapest', 'QUALITY', 'gym'),
     #   ('gym', 'PLACE', 'find')
+    #   ('work', 'LOCATION', 'near')
     # ]
     # show me the best hotel in berlin
     # [
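The effect of the change is easiest to see end to end: instead of building Doc and GoldParse objects by hand, each example is a (text, annotations) pair that nlp.update() consumes directly. A minimal, self-contained sketch of the new loop, assuming spaCy 2.0.0a19 and reusing the first example above:

    from __future__ import unicode_literals, print_function
    import random
    import spacy

    TRAIN_DATA = [
        ("find a cafe with great wifi", {
            'heads': [0, 2, 0, 5, 5, 2],  # index of each token's head
            'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
        })
    ]

    nlp = spacy.blank('en')  # blank English pipeline
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser, first=True)
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get('deps', []):
            parser.add_label(dep)  # register all labels before training

    optimizer = nlp.begin_training()
    for itn in range(15):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            # no Doc/GoldParse needed: spaCy tokenizes and aligns internally
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)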
examples/training/train_ner.py
@@ -8,22 +8,24 @@ For more details, see the documentation:
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
 
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
 from pathlib import Path
-
 import spacy
-from spacy.gold import GoldParse, biluo_tags_from_offsets
 
 
 # training data
 TRAIN_DATA = [
-    ('Who is Shaka Khan?', [(7, 17, 'PERSON')]),
-    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
+    ('Who is Shaka Khan?', {
+        'entities': [(7, 17, 'PERSON')]
+    }),
+    ('I like London and Berlin.', {
+        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
+    })
 ]
@@ -45,25 +47,28 @@ def main(model=None, output_dir=None, n_iter=100):
     if 'ner' not in nlp.pipe_names:
         ner = nlp.create_pipe('ner')
         nlp.add_pipe(ner, last=True)
+    # otherwise, get it so we can add labels
+    else:
+        ner = nlp.get_pipe('ner')
 
-    # function that allows begin_training to get the training data
-    get_data = lambda: reformat_train_data(nlp.tokenizer, TRAIN_DATA)
+    # add labels
+    for _, annotations in TRAIN_DATA:
+        for ent in annotations.get('entities'):
+            ner.add_label(ent[2])
 
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        optimizer = nlp.begin_training(get_data)
+        optimizer = nlp.begin_training()
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
-            for raw_text, entity_offsets in TRAIN_DATA:
-                doc = nlp.make_doc(raw_text)
-                gold = GoldParse(doc, entities=entity_offsets)
+            for text, annotations in TRAIN_DATA:
                 nlp.update(
-                    [doc], # Batch of Doc objects
-                    [gold], # Batch of GoldParse objects
-                    drop=0.5, # Dropout -- make it harder to memorise data
-                    sgd=optimizer, # Callable to update weights
+                    [text],  # batch of texts
+                    [annotations],  # batch of annotations
+                    drop=0.5,  # dropout - make it harder to memorise data
+                    sgd=optimizer,  # callable to update weights
                     losses=losses)
             print(losses)
@@ -90,25 +95,13 @@ def main(model=None, output_dir=None, n_iter=100):
             print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 
 
-def reformat_train_data(tokenizer, examples):
-    """Reformat data to match JSON format.
-    https://alpha.spacy.io/api/annotation#json-input
-
-    tokenizer (Tokenizer): Tokenizer to process the raw text.
-    examples (list): The training data.
-    RETURNS (list): The reformatted training data."""
-    output = []
-    for i, (text, entity_offsets) in enumerate(examples):
-        doc = tokenizer(text)
-        ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets)
-        words = [w.text for w in doc]
-        tags = ['-'] * len(doc)
-        heads = [0] * len(doc)
-        deps = [''] * len(doc)
-        sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
-        output.append((text, [(sentence, [])]))
-    return output
-
-
 if __name__ == '__main__':
     plac.call(main)
+
+    # Expected output:
+    # Entities [('Shaka Khan', 'PERSON')]
+    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
+    # ('Khan', 'PERSON', 1), ('?', '', 2)]
+    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
+    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
+    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
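Note that the 'entities' values are character offsets into the raw text, not token indices. A plain-Python sanity check (no spaCy required) confirms the spans line up:

    text = 'I like London and Berlin.'
    for start, end, label in [(7, 13, 'LOC'), (18, 24, 'LOC')]:
        print(text[start:end], label)
    # prints:
    # London LOC
    # Berlin LOC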
examples/training/train_new_entity_type.py
@@ -24,16 +24,14 @@ For more details, see the documentation:
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
 
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
 from pathlib import Path
-
 import spacy
-from spacy.gold import GoldParse, minibatch
 
 
 # new entity label
@@ -45,20 +43,29 @@ LABEL = 'ANIMAL'
 # model might learn the new type, but "forget" what it previously knew.
 # https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
 TRAIN_DATA = [
-    ("Horses are too tall and they pretend to care about your feelings",
-     [(0, 6, 'ANIMAL')]),
+    ("Horses are too tall and they pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
 
-    ("Do they bite?", []),
+    ("Do they bite?", {
+        'entities': []
+    }),
 
-    ("horses are too tall and they pretend to care about your feelings",
-     [(0, 6, 'ANIMAL')]),
+    ("horses are too tall and they pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
 
-    ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]),
+    ("horses pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
 
-    ("they pretend to care about your feelings, those horses",
-     [(48, 54, 'ANIMAL')]),
+    ("they pretend to care about your feelings, those horses", {
+        'entities': [(48, 54, 'ANIMAL')]
+    }),
 
-    ("horses?", [(0, 6, 'ANIMAL')])
+    ("horses?", {
+        'entities': [(0, 6, 'ANIMAL')]
+    })
 ]
@@ -90,15 +97,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        random.seed(0)
         optimizer = nlp.begin_training()
         for itn in range(n_iter):
+            random.shuffle(TRAIN_DATA)
             losses = {}
-            gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA)
-            for batch in minibatch(gold_parses, size=3):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, losses=losses, sgd=optimizer,
-                           drop=0.35)
+            for text, annotations in TRAIN_DATA:
+                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
+                           losses=losses)
             print(losses)
 
     # test the trained model
@@ -125,19 +130,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
             print(ent.label_, ent.text)
 
 
-def get_gold_parses(tokenizer, train_data):
-    """Shuffle and create GoldParse objects.
-
-    tokenizer (Tokenizer): Tokenizer to process the raw text.
-    train_data (list): The training data.
-    YIELDS (tuple): (doc, gold) tuples.
-    """
-    random.shuffle(train_data)
-    for raw_text, entity_offsets in train_data:
-        doc = tokenizer(raw_text)
-        gold = GoldParse(doc, entities=entity_offsets)
-        yield doc, gold
-
-
 if __name__ == '__main__':
     plac.call(main)
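The rewrite drops get_gold_parses() and with it the minibatch(..., size=3) batching, updating one example at a time instead. If batched updates are still wanted, the (text, annotations) pairs can be passed to minibatch() directly and unzipped, the same pattern the textcat example below keeps. A minimal sketch, assuming spaCy 2.0.0a19, where spacy.gold still exports minibatch:

    import random
    import spacy
    from spacy.gold import minibatch

    TRAIN_DATA = [
        ("Do they bite?", {'entities': []}),
        ("horses?", {'entities': [(0, 6, 'ANIMAL')]})
    ]

    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    ner.add_label('ANIMAL')  # register the new label before training

    optimizer = nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for batch in minibatch(TRAIN_DATA, size=3):
            # each batch is a list of (text, annotations) pairs
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                       losses=losses)
        print(losses)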
examples/training/train_parser.py
@@ -13,24 +13,19 @@ from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
-
 import spacy
-from spacy.gold import GoldParse
-from spacy.tokens import Doc
 
 
 # training data
 TRAIN_DATA = [
-    (
-        ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
-        [1, 1, 4, 4, 5, 1, 1],
-        ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
-    ),
-    (
-        ['I', 'like', 'London', 'and', 'Berlin', '.'],
-        [1, 1, 1, 2, 2, 1],
-        ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
-    )
+    ("They trade mortgage-backed securities.", {
+        'heads': [1, 1, 4, 4, 5, 1, 1],
+        'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
+    }),
+    ("I like London and Berlin.", {
+        'heads': [1, 1, 1, 2, 2, 1],
+        'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
+    })
 ]
@@ -38,7 +33,7 @@ TRAIN_DATA = [
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=1000):
+def main(model=None, output_dir=None, n_iter=10):
     """Load the model, set up the pipeline and train the parser."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -57,8 +52,8 @@ def main(model=None, output_dir=None, n_iter=1000):
         parser = nlp.get_pipe('parser')
 
     # add labels to the parser
-    for _, _, deps in TRAIN_DATA:
-        for dep in deps:
+    for _, annotations in TRAIN_DATA:
+        for dep in annotations.get('deps', []):
             parser.add_label(dep)
 
     # get names of other pipes to disable them during training
@@ -68,10 +63,8 @@ def main(model=None, output_dir=None, n_iter=1000):
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
-            for words, heads, deps in TRAIN_DATA:
-                doc = Doc(nlp.vocab, words=words)
-                gold = GoldParse(doc, heads=heads, deps=deps)
-                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+            for text, annotations in TRAIN_DATA:
+                nlp.update([text], [annotations], sgd=optimizer, losses=losses)
             print(losses)
 
     # test the trained model
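As in the other scripts, the trained parser is tested and optionally saved to output_dir at the end (not shown in this hunk). A hedged sketch of reloading such a saved model and checking that it reproduces the training annotations; the /tmp/parser_model path is hypothetical:

    import spacy

    nlp = spacy.load('/tmp/parser_model')  # hypothetical output_dir
    doc = nlp("I like London and Berlin.")
    print([(t.text, t.dep_, t.head.text) for t in doc])
    # with enough iterations, this should mirror the training annotation:
    # [('I', 'nsubj', 'like'), ('like', 'ROOT', 'like'),
    #  ('London', 'dobj', 'like'), ('and', 'cc', 'London'),
    #  ('Berlin', 'conj', 'London'), ('.', 'punct', 'like')]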
examples/training/train_tagger.py
@@ -9,17 +9,14 @@ the documentation:
 * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
 
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
 from pathlib import Path
-
 import spacy
-from spacy.tokens import Doc
-from spacy.gold import GoldParse
 
 
 # You need to define a mapping from your data's part-of-speech tag names to the
@@ -29,16 +26,16 @@ from spacy.gold import GoldParse
 # You may also specify morphological features for your tags, from the universal
 # scheme.
 TAG_MAP = {
-    'N': {"pos": "NOUN"},
-    'V': {"pos": "VERB"},
-    'J': {"pos": "ADJ"}
+    'N': {'pos': 'NOUN'},
+    'V': {'pos': 'VERB'},
+    'J': {'pos': 'ADJ'}
 }
 
 # Usually you'll read this in, of course. Data formats vary.
 # Ensure your strings are unicode.
 TRAIN_DATA = [
-    (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]),
-    (["Eat", "blue", "ham"], ["V", "J", "N"])
+    ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
+    ("Eat blue ham", {'tags': ['V', 'J', 'N']})
 ]
@@ -64,10 +61,8 @@ def main(lang='en', output_dir=None, n_iter=25):
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for words, tags in TRAIN_DATA:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, tags=tags)
-            nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+        for text, annotations in TRAIN_DATA:
+            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
         print(losses)
 
     # test the trained model
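The 'tags' strings only mean something once the tagger knows TAG_MAP. A sketch of the full wiring, assuming spaCy 2.0.0a19; the tagger.add_label(tag, values) signature is an assumption about the unchanged part of this script, which the diff does not show:

    from __future__ import unicode_literals, print_function
    import random
    import spacy

    TAG_MAP = {'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}, 'J': {'pos': 'ADJ'}}
    TRAIN_DATA = [
        ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
        ("Eat blue ham", {'tags': ['V', 'J', 'N']})
    ]

    nlp = spacy.blank('en')
    tagger = nlp.create_pipe('tagger')
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)  # assumed signature: tag name + features
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(25):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    doc = nlp("I like blue eggs")
    print([(t.text, t.tag_, t.pos_) for t in doc])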
examples/training/train_textcat.py
@@ -9,7 +9,7 @@ see the documentation:
 * Text classification: https://alpha.spacy.io/usage/text-classification
 
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 import plac
@@ -18,9 +18,8 @@ from pathlib import Path
 import thinc.extra.datasets
 
 import spacy
-from spacy.gold import GoldParse, minibatch
+from spacy.gold import minibatch
 from spacy.util import compounding
-from spacy.pipeline import TextCategorizer
 
 
 @plac.annotations(
@@ -52,10 +51,8 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     print("Loading IMDB data...")
     (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
     print("Using %d training examples" % n_texts)
-    train_docs = [nlp.tokenizer(text) for text in train_texts]
-    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
-                  zip(train_docs, train_cats)]
-    train_data = list(zip(train_docs, train_gold))
+    train_data = list(zip(train_texts,
+                          [{'cats': cats} for cats in train_cats]))
 
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
@@ -68,8 +65,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
             # batch up the examples using spaCy's minibatch
             batches = minibatch(train_data, size=compounding(4., 32., 1.001))
             for batch in batches:
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
+                texts, annotations = zip(*batch)
+                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
+                           losses=losses)
             with textcat.model.use_params(optimizer.averages):
                 # evaluate on the dev data split off in load_data()
                 scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
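For the text classifier, each annotation dict carries a 'cats' mapping of label to score, and the (text, annotations) pairs feed minibatch() directly. A self-contained sketch with toy data standing in for the IMDB set, assuming spaCy 2.0.0a19:

    from __future__ import unicode_literals, print_function
    import random
    import spacy
    from spacy.gold import minibatch
    from spacy.util import compounding

    TRAIN_DATA = [
        ("This movie was great", {'cats': {'POSITIVE': 1.0}}),
        ("Utterly boring and far too long", {'cats': {'POSITIVE': 0.0}})
    ]

    nlp = spacy.blank('en')
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
    textcat.add_label('POSITIVE')

    optimizer = nlp.begin_training()
    for i in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        print(losses)

    doc = nlp("This movie was great")
    print(doc.cats)  # e.g. a high 'POSITIVE' score once trained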