From 399987c2166b55a9e99e1286b7feec229927cd6a Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 16 Mar 2019 14:15:49 +0100
Subject: [PATCH] Test and update examples [ci skip]

---
 .../entity_relations.py                       |  1 +
 .../information_extraction/parse_subtrees.py  |  1 +
 examples/pipeline/custom_attr_methods.py      |  1 +
 .../custom_component_countries_api.py         |  1 +
 .../pipeline/custom_component_entities.py     |  1 +
 .../pipeline/custom_sentence_segmentation.py  | 32 ++++++++++++-------
 examples/pipeline/fix_space_entities.py       | 11 +++++--
 examples/pipeline/multi_processing.py         |  2 ++
 examples/training/train_ner.py                |  1 +
 examples/training/train_new_entity_type.py    |  5 +--
 examples/training/train_parser.py             |  3 +-
 examples/training/train_tagger.py             |  1 +
 examples/training/train_textcat.py            |  8 ++---
 13 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py
index aab5d4f33..ffc8164e1 100644
--- a/examples/information_extraction/entity_relations.py
+++ b/examples/information_extraction/entity_relations.py
@@ -7,6 +7,7 @@ dependency tree to find the noun phrase they are referring to – for example:
 $9.4 million --> Net income.
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 from __future__ import unicode_literals, print_function
 
diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py
index 55968c9da..2ca9da1ea 100644
--- a/examples/information_extraction/parse_subtrees.py
+++ b/examples/information_extraction/parse_subtrees.py
@@ -17,6 +17,7 @@ show you how computers understand [language]
 I'm assuming that we can use the token.head to build these groups."
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 from __future__ import unicode_literals, print_function
 
diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py
index 59ce1d0e5..7f97bc1c3 100644
--- a/examples/pipeline/custom_attr_methods.py
+++ b/examples/pipeline/custom_attr_methods.py
@@ -8,6 +8,7 @@ they're called on is passed in as the first argument.
 * Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 from __future__ import unicode_literals, print_function
 
diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py
index 091d331fc..241c0af37 100644
--- a/examples/pipeline/custom_component_countries_api.py
+++ b/examples/pipeline/custom_component_countries_api.py
@@ -9,6 +9,7 @@ coordinates. Can be extended with more details from the API.
 * Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 Prerequisites: pip install requests
 """
 from __future__ import unicode_literals, print_function
diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py
index c7f48d504..a53b688b0 100644
--- a/examples/pipeline/custom_component_entities.py
+++ b/examples/pipeline/custom_component_entities.py
@@ -9,6 +9,7 @@ respectively.
 * Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 from __future__ import unicode_literals, print_function
 
diff --git a/examples/pipeline/custom_sentence_segmentation.py b/examples/pipeline/custom_sentence_segmentation.py
index 43ac64979..ff59ab187 100644
--- a/examples/pipeline/custom_sentence_segmentation.py
+++ b/examples/pipeline/custom_sentence_segmentation.py
@@ -10,6 +10,9 @@ should also improve the parse quality.
 The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
 Other versions of the model may not make the original mistake, so the specific
 example might not be apt for future versions.
+
+Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 import plac
 import spacy
@@ -25,8 +28,10 @@ def prevent_sentence_boundaries(doc):
 def can_be_sentence_start(token):
     if token.i == 0:
         return True
-    elif token.is_title:
-        return True
+    # is_title is deliberately not checked here, so that arbitrary titlecased
+    # tokens within a sentence are not treated as possible sentence starts
+    # elif token.is_title:
+    #    return True
     elif token.nbor(-1).is_punct:
         return True
     elif token.nbor(-1).is_space:
@@ -35,16 +40,21 @@ def can_be_sentence_start(token):
         return False
 
 
-def main():
-    nlp = spacy.load("en_core_web_lg")
-    raw_text = "Been here and I'm loving it."
-    doc = nlp(raw_text)
-    sentences = [sent.string.strip() for sent in doc.sents]
-    print(sentences)
+@plac.annotations(
+    text=("The raw text to process", "positional", None, str),
+    spacy_model=("spaCy model to use (with a parser)", "option", "m", str),
+)
+def main(text="Been here And I'm loving it.", spacy_model="en_core_web_lg"):
+    print("Using spaCy model '{}'".format(spacy_model))
+    print("Processing text '{}'".format(text))
+    nlp = spacy.load(spacy_model)
+    doc = nlp(text)
+    sentences = [sent.text.strip() for sent in doc.sents]
+    print("Before:", sentences)
     nlp.add_pipe(prevent_sentence_boundaries, before="parser")
-    doc = nlp(raw_text)
-    sentences = [sent.string.strip() for sent in doc.sents]
-    print(sentences)
+    doc = nlp(text)
+    sentences = [sent.text.strip() for sent in doc.sents]
+    print("After:", sentences)
 
 
 if __name__ == "__main__":
diff --git a/examples/pipeline/fix_space_entities.py b/examples/pipeline/fix_space_entities.py
index e3d37ad38..686253eca 100644
--- a/examples/pipeline/fix_space_entities.py
+++ b/examples/pipeline/fix_space_entities.py
@@ -1,7 +1,14 @@
+#!/usr/bin/env python
+# coding: utf8
 """Demonstrate adding a rule-based component that forces some tokens to not
 be entities, before the NER tagger is applied. This is used to hotfix the issue
-in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16.
+in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16.
+
+Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
+from __future__ import unicode_literals
+
 import spacy
 from spacy.attrs import ENT_IOB
 
@@ -18,7 +25,7 @@ def fix_space_tags(doc):
 
 def main():
     nlp = spacy.load("en_core_web_sm")
-    text = u"""This is some crazy test where I dont need an Apple                Watch to make things bug"""
+    text = "This is some crazy test where I dont need an Apple                Watch to make things bug"
     doc = nlp(text)
     print("Before", doc.ents)
     nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner")
diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py
index 4c4d29c13..f0e437acf 100644
--- a/examples/pipeline/multi_processing.py
+++ b/examples/pipeline/multi_processing.py
@@ -7,6 +7,8 @@ the IMDB movie reviews dataset and will be loaded automatically via Thinc's
 built-in dataset loader.
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
+Prerequisites: pip install joblib
 """
 from __future__ import print_function, unicode_literals
 
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 797dbcb9c..49c25654c 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -8,6 +8,7 @@ For more details, see the documentation:
 * NER: https://spacy.io/usage/linguistic-features#named-entities
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 from __future__ import unicode_literals, print_function
 
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index b6fc84590..31976373b 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -24,6 +24,7 @@ For more details, see the documentation:
 * NER: https://spacy.io/usage/linguistic-features#named-entities
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 from __future__ import unicode_literals, print_function
 
@@ -87,7 +88,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
 
     ner.add_label(LABEL)  # add new entity label to entity recognizer
     # Adding extraneous labels shouldn't mess anything up
-    ner.add_label('VEGETABLE')
+    ner.add_label("VEGETABLE")
     if model is None:
         optimizer = nlp.begin_training()
     else:
@@ -127,7 +128,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
         print("Loading from", output_dir)
         nlp2 = spacy.load(output_dir)
         # Check the classes have loaded back consistently
-        assert nlp2.get_pipe('ner').move_names == move_names
+        assert nlp2.get_pipe("ner").move_names == move_names
         doc2 = nlp2(test_text)
         for ent in doc2.ents:
             print(ent.label_, ent.text)
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index aff33c88f..aa60af00b 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -6,6 +6,7 @@ model or a blank model. For more details, see the documentation:
 * Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 from __future__ import unicode_literals, print_function
 
@@ -40,7 +41,7 @@ TRAIN_DATA = [
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int),
 )
-def main(model=None, output_dir=None, n_iter=10):
+def main(model=None, output_dir=None, n_iter=15):
     """Load the model, set up the pipeline and train the parser."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index db0627270..7136273b3 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -9,6 +9,7 @@ the documentation:
 * POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging
 
 Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
 """
 from __future__ import unicode_literals, print_function
 
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 7b55afe04..bd9e5ee18 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -41,9 +41,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     # add the text classifier to the pipeline if it doesn't exist
     # nlp.create_pipe works for built-ins that are registered with spaCy
     if "textcat" not in nlp.pipe_names:
-        textcat = nlp.create_pipe("textcat", config={
-            "architecture": "simple_cnn",
-            "exclusive_classes": True})
+        textcat = nlp.create_pipe(
+            "textcat", config={"architecture": "simple_cnn", "exclusive_classes": True}
+        )
         nlp.add_pipe(textcat, last=True)
     # otherwise, get it, so we can add labels to it
     else:
@@ -140,7 +140,7 @@ def evaluate(tokenizer, textcat, texts, cats):
                 fn += 1
     precision = tp / (tp + fp)
     recall = tp / (tp + fn)
-    if (precision+recall) == 0:
+    if (precision + recall) == 0:
         f_score = 0.0
     else:
         f_score = 2 * (precision * recall) / (precision + recall)