mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-10 09:16:31 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
30d6c2ccc2
8
CITATION
8
CITATION
|
@ -1,6 +1,6 @@
|
|||
@ARTICLE{spacy2,
|
||||
AUTHOR = {Honnibal, Matthew AND Montani, Ines},
|
||||
TITLE = {spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing},
|
||||
@unpublished{spacy2,
|
||||
AUTHOR = {Honnibal, Matthew and Montani, Ines},
|
||||
TITLE = {{spaCy 2}: Natural language understanding with {B}loom embeddings, convolutional neural networks and incremental parsing},
|
||||
YEAR = {2017},
|
||||
JOURNAL = {To appear}
|
||||
Note = {To appear}
|
||||
}
|
||||
|
|
|
@ -23,19 +23,39 @@ from .train import _load_pretrained_tok2vec
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
|
||||
"key 'tokens'", "positional", None, str),
|
||||
texts_loc=(
|
||||
"Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
|
||||
"key 'tokens'",
|
||||
"positional",
|
||||
None,
|
||||
str,
|
||||
),
|
||||
vectors_model=("Name or path to spaCy model with vectors to learn from"),
|
||||
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
||||
width=("Width of CNN layers", "option", "cw", int),
|
||||
depth=("Depth of CNN layers", "option", "cd", int),
|
||||
embed_rows=("Number of embedding rows", "option", "er", int),
|
||||
loss_func=("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str),
|
||||
loss_func=(
|
||||
"Loss function to use for the objective. Either 'L2' or 'cosine'",
|
||||
"option",
|
||||
"L",
|
||||
str,
|
||||
),
|
||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||
dropout=("Dropout rate", "option", "d", float),
|
||||
batch_size=("Number of words per training batch", "option", "bs", int),
|
||||
max_length=("Max words per example. Longer examples are discarded", "option", "xw", int),
|
||||
min_length=("Min words per example. Shorter examples are discarded", "option", "nw", int),
|
||||
max_length=(
|
||||
"Max words per example. Longer examples are discarded",
|
||||
"option",
|
||||
"xw",
|
||||
int,
|
||||
),
|
||||
min_length=(
|
||||
"Min words per example. Shorter examples are discarded",
|
||||
"option",
|
||||
"nw",
|
||||
int,
|
||||
),
|
||||
seed=("Seed for random number generators", "option", "s", int),
|
||||
n_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||
n_save_every=("Save model every X batches.", "option", "se", int),
|
||||
|
|
|
@ -116,7 +116,7 @@ def parse_deps(orig_doc, options={}):
|
|||
doc (Doc): Document to parse.
|
||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
|
||||
if not doc.is_parsed:
|
||||
user_warning(Warnings.W005)
|
||||
if options.get("collapse_phrases", False):
|
||||
|
|
|
@ -537,6 +537,7 @@ for orth in [
|
|||
"Sen.",
|
||||
"St.",
|
||||
"vs.",
|
||||
"v.s."
|
||||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
|
15
spacy/tests/regression/test_issue3882.py
Normal file
15
spacy/tests/regression/test_issue3882.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.displacy import parse_deps
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue3882(en_vocab):
|
||||
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
||||
copy of the Doc.
|
||||
"""
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
doc.is_parsed = True
|
||||
doc.user_data["test"] = set()
|
||||
parse_deps(doc)
|
|
@ -305,11 +305,11 @@ match on the uppercase versions, in case someone has written it as "Google i/o".
|
|||
|
||||
```python
|
||||
### {executable="true"}
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Span
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
nlp = English()
|
||||
matcher = Matcher(nlp.vocab)
|
||||
|
||||
def add_event_ent(matcher, doc, i, matches):
|
||||
|
@ -322,7 +322,7 @@ def add_event_ent(matcher, doc, i, matches):
|
|||
|
||||
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
|
||||
matcher.add("GoogleIO", add_event_ent, pattern)
|
||||
doc = nlp(u"This is a text about Google I/O.")
|
||||
doc = nlp(u"This is a text about Google I/O")
|
||||
matches = matcher(doc)
|
||||
```
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user