mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Test and update examples [ci skip]
This commit is contained in:
parent
f55a52a2dd
commit
399987c216
|
@ -7,6 +7,7 @@ dependency tree to find the noun phrase they are referring to – for example:
|
||||||
$9.4 million --> Net income.
|
$9.4 million --> Net income.
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ show you how computers understand [language]
|
||||||
I'm assuming that we can use the token.head to build these groups."
|
I'm assuming that we can use the token.head to build these groups."
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
|
|
@ -8,6 +8,7 @@ they're called on is passed in as the first argument.
|
||||||
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ coordinates. Can be extended with more details from the API.
|
||||||
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
Prerequisites: pip install requests
|
Prerequisites: pip install requests
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
|
@ -9,6 +9,7 @@ respectively.
|
||||||
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,9 @@ should also improve the parse quality.
|
||||||
The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
|
The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
|
||||||
Other versions of the model may not make the original mistake, so the specific
|
Other versions of the model may not make the original mistake, so the specific
|
||||||
example might not be apt for future versions.
|
example might not be apt for future versions.
|
||||||
|
|
||||||
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
import plac
|
import plac
|
||||||
import spacy
|
import spacy
|
||||||
|
@ -25,8 +28,10 @@ def prevent_sentence_boundaries(doc):
|
||||||
def can_be_sentence_start(token):
|
def can_be_sentence_start(token):
|
||||||
if token.i == 0:
|
if token.i == 0:
|
||||||
return True
|
return True
|
||||||
elif token.is_title:
|
# We're not checking for is_title here to ignore arbitrary titlecased
|
||||||
return True
|
# tokens within sentences
|
||||||
|
# elif token.is_title:
|
||||||
|
# return True
|
||||||
elif token.nbor(-1).is_punct:
|
elif token.nbor(-1).is_punct:
|
||||||
return True
|
return True
|
||||||
elif token.nbor(-1).is_space:
|
elif token.nbor(-1).is_space:
|
||||||
|
@ -35,16 +40,21 @@ def can_be_sentence_start(token):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def main():
|
@plac.annotations(
|
||||||
nlp = spacy.load("en_core_web_lg")
|
text=("The raw text to process", "positional", None, str),
|
||||||
raw_text = "Been here and I'm loving it."
|
spacy_model=("spaCy model to use (with a parser)", "option", "m", str),
|
||||||
doc = nlp(raw_text)
|
)
|
||||||
sentences = [sent.string.strip() for sent in doc.sents]
|
def main(text="Been here And I'm loving it.", spacy_model="en_core_web_lg"):
|
||||||
print(sentences)
|
print("Using spaCy model '{}'".format(spacy_model))
|
||||||
|
print("Processing text '{}'".format(text))
|
||||||
|
nlp = spacy.load(spacy_model)
|
||||||
|
doc = nlp(text)
|
||||||
|
sentences = [sent.text.strip() for sent in doc.sents]
|
||||||
|
print("Before:", sentences)
|
||||||
nlp.add_pipe(prevent_sentence_boundaries, before="parser")
|
nlp.add_pipe(prevent_sentence_boundaries, before="parser")
|
||||||
doc = nlp(raw_text)
|
doc = nlp(text)
|
||||||
sentences = [sent.string.strip() for sent in doc.sents]
|
sentences = [sent.text.strip() for sent in doc.sents]
|
||||||
print(sentences)
|
print("After:", sentences)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,7 +1,14 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# coding: utf8
|
||||||
"""Demonstrate adding a rule-based component that forces some tokens to not
|
"""Demonstrate adding a rule-based component that forces some tokens to not
|
||||||
be entities, before the NER tagger is applied. This is used to hotfix the issue
|
be entities, before the NER tagger is applied. This is used to hotfix the issue
|
||||||
in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16.
|
in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16.
|
||||||
|
|
||||||
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.attrs import ENT_IOB
|
from spacy.attrs import ENT_IOB
|
||||||
|
|
||||||
|
@ -18,7 +25,7 @@ def fix_space_tags(doc):
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
text = u"""This is some crazy test where I dont need an Apple Watch to make things bug"""
|
text = "This is some crazy test where I dont need an Apple Watch to make things bug"
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
print("Before", doc.ents)
|
print("Before", doc.ents)
|
||||||
nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner")
|
nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner")
|
||||||
|
|
|
@ -7,6 +7,8 @@ the IMDB movie reviews dataset and will be loaded automatically via Thinc's
|
||||||
built-in dataset loader.
|
built-in dataset loader.
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
|
Prerequisites: pip install joblib
|
||||||
"""
|
"""
|
||||||
from __future__ import print_function, unicode_literals
|
from __future__ import print_function, unicode_literals
|
||||||
|
|
||||||
|
|
|
@ -8,6 +8,7 @@ For more details, see the documentation:
|
||||||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@ For more details, see the documentation:
|
||||||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
@ -87,7 +88,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
||||||
|
|
||||||
ner.add_label(LABEL) # add new entity label to entity recognizer
|
ner.add_label(LABEL) # add new entity label to entity recognizer
|
||||||
# Adding extraneous labels shouldn't mess anything up
|
# Adding extraneous labels shouldn't mess anything up
|
||||||
ner.add_label('VEGETABLE')
|
ner.add_label("VEGETABLE")
|
||||||
if model is None:
|
if model is None:
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
else:
|
else:
|
||||||
|
@ -127,7 +128,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
||||||
print("Loading from", output_dir)
|
print("Loading from", output_dir)
|
||||||
nlp2 = spacy.load(output_dir)
|
nlp2 = spacy.load(output_dir)
|
||||||
# Check the classes have loaded back consistently
|
# Check the classes have loaded back consistently
|
||||||
assert nlp2.get_pipe('ner').move_names == move_names
|
assert nlp2.get_pipe("ner").move_names == move_names
|
||||||
doc2 = nlp2(test_text)
|
doc2 = nlp2(test_text)
|
||||||
for ent in doc2.ents:
|
for ent in doc2.ents:
|
||||||
print(ent.label_, ent.text)
|
print(ent.label_, ent.text)
|
||||||
|
|
|
@ -6,6 +6,7 @@ model or a blank model. For more details, see the documentation:
|
||||||
* Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse
|
* Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
@ -40,7 +41,7 @@ TRAIN_DATA = [
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int),
|
n_iter=("Number of training iterations", "option", "n", int),
|
||||||
)
|
)
|
||||||
def main(model=None, output_dir=None, n_iter=10):
|
def main(model=None, output_dir=None, n_iter=15):
|
||||||
"""Load the model, set up the pipeline and train the parser."""
|
"""Load the model, set up the pipeline and train the parser."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
|
|
|
@ -9,6 +9,7 @@ the documentation:
|
||||||
* POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging
|
* POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
|
Last tested with: v2.1.0
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
|
|
@ -41,9 +41,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
|
||||||
# add the text classifier to the pipeline if it doesn't exist
|
# add the text classifier to the pipeline if it doesn't exist
|
||||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||||
if "textcat" not in nlp.pipe_names:
|
if "textcat" not in nlp.pipe_names:
|
||||||
textcat = nlp.create_pipe("textcat", config={
|
textcat = nlp.create_pipe(
|
||||||
"architecture": "simple_cnn",
|
"textcat", config={"architecture": "simple_cnn", "exclusive_classes": True}
|
||||||
"exclusive_classes": True})
|
)
|
||||||
nlp.add_pipe(textcat, last=True)
|
nlp.add_pipe(textcat, last=True)
|
||||||
# otherwise, get it, so we can add labels to it
|
# otherwise, get it, so we can add labels to it
|
||||||
else:
|
else:
|
||||||
|
@ -140,7 +140,7 @@ def evaluate(tokenizer, textcat, texts, cats):
|
||||||
fn += 1
|
fn += 1
|
||||||
precision = tp / (tp + fp)
|
precision = tp / (tp + fp)
|
||||||
recall = tp / (tp + fn)
|
recall = tp / (tp + fn)
|
||||||
if (precision+recall) == 0:
|
if (precision + recall) == 0:
|
||||||
f_score = 0.0
|
f_score = 0.0
|
||||||
else:
|
else:
|
||||||
f_score = 2 * (precision * recall) / (precision + recall)
|
f_score = 2 * (precision * recall) / (precision + recall)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user