mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-10 17:26:42 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
30d6c2ccc2
10
CITATION
10
CITATION
|
@ -1,6 +1,6 @@
|
||||||
@ARTICLE{spacy2,
|
@unpublished{spacy2,
|
||||||
AUTHOR = {Honnibal, Matthew AND Montani, Ines},
|
AUTHOR = {Honnibal, Matthew and Montani, Ines},
|
||||||
TITLE = {spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing},
|
TITLE = {{spaCy 2}: Natural language understanding with {B}loom embeddings, convolutional neural networks and incremental parsing},
|
||||||
YEAR = {2017},
|
YEAR = {2017},
|
||||||
JOURNAL = {To appear}
|
Note = {To appear}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,19 +23,39 @@ from .train import _load_pretrained_tok2vec
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
|
texts_loc=(
|
||||||
"key 'tokens'", "positional", None, str),
|
"Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
|
||||||
|
"key 'tokens'",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
str,
|
||||||
|
),
|
||||||
vectors_model=("Name or path to spaCy model with vectors to learn from"),
|
vectors_model=("Name or path to spaCy model with vectors to learn from"),
|
||||||
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
||||||
width=("Width of CNN layers", "option", "cw", int),
|
width=("Width of CNN layers", "option", "cw", int),
|
||||||
depth=("Depth of CNN layers", "option", "cd", int),
|
depth=("Depth of CNN layers", "option", "cd", int),
|
||||||
embed_rows=("Number of embedding rows", "option", "er", int),
|
embed_rows=("Number of embedding rows", "option", "er", int),
|
||||||
loss_func=("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str),
|
loss_func=(
|
||||||
|
"Loss function to use for the objective. Either 'L2' or 'cosine'",
|
||||||
|
"option",
|
||||||
|
"L",
|
||||||
|
str,
|
||||||
|
),
|
||||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||||
dropout=("Dropout rate", "option", "d", float),
|
dropout=("Dropout rate", "option", "d", float),
|
||||||
batch_size=("Number of words per training batch", "option", "bs", int),
|
batch_size=("Number of words per training batch", "option", "bs", int),
|
||||||
max_length=("Max words per example. Longer examples are discarded", "option", "xw", int),
|
max_length=(
|
||||||
min_length=("Min words per example. Shorter examples are discarded", "option", "nw", int),
|
"Max words per example. Longer examples are discarded",
|
||||||
|
"option",
|
||||||
|
"xw",
|
||||||
|
int,
|
||||||
|
),
|
||||||
|
min_length=(
|
||||||
|
"Min words per example. Shorter examples are discarded",
|
||||||
|
"option",
|
||||||
|
"nw",
|
||||||
|
int,
|
||||||
|
),
|
||||||
seed=("Seed for random number generators", "option", "s", int),
|
seed=("Seed for random number generators", "option", "s", int),
|
||||||
n_iter=("Number of iterations to pretrain", "option", "i", int),
|
n_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||||
n_save_every=("Save model every X batches.", "option", "se", int),
|
n_save_every=("Save model every X batches.", "option", "se", int),
|
||||||
|
|
|
@ -116,7 +116,7 @@ def parse_deps(orig_doc, options={}):
|
||||||
doc (Doc): Document to parse.
|
doc (Doc): Document to parse.
|
||||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||||
"""
|
"""
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
user_warning(Warnings.W005)
|
user_warning(Warnings.W005)
|
||||||
if options.get("collapse_phrases", False):
|
if options.get("collapse_phrases", False):
|
||||||
|
|
|
@ -537,6 +537,7 @@ for orth in [
|
||||||
"Sen.",
|
"Sen.",
|
||||||
"St.",
|
"St.",
|
||||||
"vs.",
|
"vs.",
|
||||||
|
"v.s."
|
||||||
]:
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
15
spacy/tests/regression/test_issue3882.py
Normal file
15
spacy/tests/regression/test_issue3882.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from spacy.displacy import parse_deps
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3882(en_vocab):
|
||||||
|
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
||||||
|
copy of the Doc.
|
||||||
|
"""
|
||||||
|
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||||
|
doc.is_parsed = True
|
||||||
|
doc.user_data["test"] = set()
|
||||||
|
parse_deps(doc)
|
|
@ -305,11 +305,11 @@ match on the uppercase versions, in case someone has written it as "Google i/o".
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
from spacy.lang.en import English
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Span
|
from spacy.tokens import Span
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = English()
|
||||||
matcher = Matcher(nlp.vocab)
|
matcher = Matcher(nlp.vocab)
|
||||||
|
|
||||||
def add_event_ent(matcher, doc, i, matches):
|
def add_event_ent(matcher, doc, i, matches):
|
||||||
|
@ -322,7 +322,7 @@ def add_event_ent(matcher, doc, i, matches):
|
||||||
|
|
||||||
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
|
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
|
||||||
matcher.add("GoogleIO", add_event_ent, pattern)
|
matcher.add("GoogleIO", add_event_ent, pattern)
|
||||||
doc = nlp(u"This is a text about Google I/O.")
|
doc = nlp(u"This is a text about Google I/O")
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user