From 399987c2166b55a9e99e1286b7feec229927cd6a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Mar 2019 14:15:49 +0100 Subject: [PATCH] Test and update examples [ci skip] --- .../entity_relations.py | 1 + .../information_extraction/parse_subtrees.py | 1 + examples/pipeline/custom_attr_methods.py | 1 + .../custom_component_countries_api.py | 1 + .../pipeline/custom_component_entities.py | 1 + .../pipeline/custom_sentence_segmentation.py | 32 ++++++++++++------- examples/pipeline/fix_space_entities.py | 11 +++++-- examples/pipeline/multi_processing.py | 2 ++ examples/training/train_ner.py | 1 + examples/training/train_new_entity_type.py | 5 +-- examples/training/train_parser.py | 3 +- examples/training/train_tagger.py | 1 + examples/training/train_textcat.py | 8 ++--- 13 files changed, 48 insertions(+), 20 deletions(-) diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py index aab5d4f33..ffc8164e1 100644 --- a/examples/information_extraction/entity_relations.py +++ b/examples/information_extraction/entity_relations.py @@ -7,6 +7,7 @@ dependency tree to find the noun phrase they are referring to – for example: $9.4 million --> Net income. Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py index 55968c9da..2ca9da1ea 100644 --- a/examples/information_extraction/parse_subtrees.py +++ b/examples/information_extraction/parse_subtrees.py @@ -17,6 +17,7 @@ show you how computers understand [language] I'm assuming that we can use the token.head to build these groups." Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py index 59ce1d0e5..7f97bc1c3 100644 --- a/examples/pipeline/custom_attr_methods.py +++ b/examples/pipeline/custom_attr_methods.py @@ -8,6 +8,7 @@ they're called on is passed in as the first argument. * Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py index 091d331fc..241c0af37 100644 --- a/examples/pipeline/custom_component_countries_api.py +++ b/examples/pipeline/custom_component_countries_api.py @@ -9,6 +9,7 @@ coordinates. Can be extended with more details from the API. * Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 Prerequisites: pip install requests """ from __future__ import unicode_literals, print_function diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py index c7f48d504..a53b688b0 100644 --- a/examples/pipeline/custom_component_entities.py +++ b/examples/pipeline/custom_component_entities.py @@ -9,6 +9,7 @@ respectively. * Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/pipeline/custom_sentence_segmentation.py b/examples/pipeline/custom_sentence_segmentation.py index 43ac64979..ff59ab187 100644 --- a/examples/pipeline/custom_sentence_segmentation.py +++ b/examples/pipeline/custom_sentence_segmentation.py @@ -10,6 +10,9 @@ should also improve the parse quality. The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627 Other versions of the model may not make the original mistake, so the specific example might not be apt for future versions. + +Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ import plac import spacy @@ -25,8 +28,10 @@ def prevent_sentence_boundaries(doc): def can_be_sentence_start(token): if token.i == 0: return True - elif token.is_title: - return True + # We're not checking for is_title here to ignore arbitrary titlecased + # tokens within sentences + # elif token.is_title: + # return True elif token.nbor(-1).is_punct: return True elif token.nbor(-1).is_space: @@ -35,16 +40,21 @@ def can_be_sentence_start(token): return False -def main(): - nlp = spacy.load("en_core_web_lg") - raw_text = "Been here and I'm loving it." - doc = nlp(raw_text) - sentences = [sent.string.strip() for sent in doc.sents] - print(sentences) +@plac.annotations( + text=("The raw text to process", "positional", None, str), + spacy_model=("spaCy model to use (with a parser)", "option", "m", str), +) +def main(text="Been here And I'm loving it.", spacy_model="en_core_web_lg"): + print("Using spaCy model '{}'".format(spacy_model)) + print("Processing text '{}'".format(text)) + nlp = spacy.load(spacy_model) + doc = nlp(text) + sentences = [sent.text.strip() for sent in doc.sents] + print("Before:", sentences) nlp.add_pipe(prevent_sentence_boundaries, before="parser") - doc = nlp(raw_text) - sentences = [sent.string.strip() for sent in doc.sents] - print(sentences) + doc = nlp(text) + sentences = [sent.text.strip() for sent in doc.sents] + print("After:", sentences) if __name__ == "__main__": diff --git a/examples/pipeline/fix_space_entities.py b/examples/pipeline/fix_space_entities.py index e3d37ad38..686253eca 100644 --- a/examples/pipeline/fix_space_entities.py +++ b/examples/pipeline/fix_space_entities.py @@ -1,7 +1,14 @@ +#!/usr/bin/env python +# coding: utf8 """Demonstrate adding a rule-based component that forces some tokens to not be entities, before the NER tagger is applied. This is used to hotfix the issue -in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16. +in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16. + +Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ +from __future__ import unicode_literals + import spacy from spacy.attrs import ENT_IOB @@ -18,7 +25,7 @@ def fix_space_tags(doc): def main(): nlp = spacy.load("en_core_web_sm") - text = u"""This is some crazy test where I dont need an Apple Watch to make things bug""" + text = "This is some crazy test where I dont need an Apple Watch to make things bug" doc = nlp(text) print("Before", doc.ents) nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner") diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py index 4c4d29c13..f0e437acf 100644 --- a/examples/pipeline/multi_processing.py +++ b/examples/pipeline/multi_processing.py @@ -7,6 +7,8 @@ the IMDB movie reviews dataset and will be loaded automatically via Thinc's built-in dataset loader. Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 +Prerequisites: pip install joblib """ from __future__ import print_function, unicode_literals diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 797dbcb9c..49c25654c 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -8,6 +8,7 @@ For more details, see the documentation: * NER: https://spacy.io/usage/linguistic-features#named-entities Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index b6fc84590..31976373b 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -24,6 +24,7 @@ For more details, see the documentation: * NER: https://spacy.io/usage/linguistic-features#named-entities Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ from __future__ import unicode_literals, print_function @@ -87,7 +88,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): ner.add_label(LABEL) # add new entity label to entity recognizer # Adding extraneous labels shouldn't mess anything up - ner.add_label('VEGETABLE') + ner.add_label("VEGETABLE") if model is None: optimizer = nlp.begin_training() else: @@ -127,7 +128,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): print("Loading from", output_dir) nlp2 = spacy.load(output_dir) # Check the classes have loaded back consistently - assert nlp2.get_pipe('ner').move_names == move_names + assert nlp2.get_pipe("ner").move_names == move_names doc2 = nlp2(test_text) for ent in doc2.ents: print(ent.label_, ent.text) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index aff33c88f..aa60af00b 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -6,6 +6,7 @@ model or a blank model. For more details, see the documentation: * Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ from __future__ import unicode_literals, print_function @@ -40,7 +41,7 @@ TRAIN_DATA = [ output_dir=("Optional output directory", "option", "o", Path), n_iter=("Number of training iterations", "option", "n", int), ) -def main(model=None, output_dir=None, n_iter=10): +def main(model=None, output_dir=None, n_iter=15): """Load the model, set up the pipeline and train the parser.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index db0627270..7136273b3 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -9,6 +9,7 @@ the documentation: * POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging Compatible with: spaCy v2.0.0+ +Last tested with: v2.1.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 7b55afe04..bd9e5ee18 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -41,9 +41,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000): # add the text classifier to the pipeline if it doesn't exist # nlp.create_pipe works for built-ins that are registered with spaCy if "textcat" not in nlp.pipe_names: - textcat = nlp.create_pipe("textcat", config={ - "architecture": "simple_cnn", - "exclusive_classes": True}) + textcat = nlp.create_pipe( + "textcat", config={"architecture": "simple_cnn", "exclusive_classes": True} + ) nlp.add_pipe(textcat, last=True) # otherwise, get it, so we can add labels to it else: @@ -140,7 +140,7 @@ def evaluate(tokenizer, textcat, texts, cats): fn += 1 precision = tp / (tp + fp) recall = tp / (tp + fn) - if (precision+recall) == 0: + if (precision + recall) == 0: f_score = 0.0 else: f_score = 2 * (precision * recall) / (precision + recall)