mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-07 15:56:32 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
e8f284c741
|
@ -48,4 +48,6 @@ redirects = [
|
||||||
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
|
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
|
||||||
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
|
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
|
||||||
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
|
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
|
||||||
|
# Renamed universe projects
|
||||||
|
{from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true}
|
||||||
]
|
]
|
||||||
|
|
|
@ -11,6 +11,7 @@ numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
plac<1.0.0,>=0.9.6
|
plac<1.0.0,>=0.9.6
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
pathlib==1.0.1; python_version < "3.4"
|
||||||
|
importlib_metadata>=0.23; python_version < "3.8"
|
||||||
# Optional dependencies
|
# Optional dependencies
|
||||||
jsonschema>=2.6.0,<3.1.0
|
jsonschema>=2.6.0,<3.1.0
|
||||||
# Development dependencies
|
# Development dependencies
|
||||||
|
|
|
@ -39,6 +39,7 @@ setup_requires =
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=7.1.1,<7.2.0
|
thinc>=7.1.1,<7.2.0
|
||||||
install_requires =
|
install_requires =
|
||||||
|
setuptools
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
|
@ -50,6 +51,7 @@ install_requires =
|
||||||
wasabi>=0.2.0,<1.1.0
|
wasabi>=0.2.0,<1.1.0
|
||||||
srsly>=0.1.0,<1.1.0
|
srsly>=0.1.0,<1.1.0
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
pathlib==1.0.1; python_version < "3.4"
|
||||||
|
importlib_metadata>=0.23; python_version < "3.8"
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
|
|
|
@ -14,6 +14,7 @@ from .glossary import explain
|
||||||
from .about import __version__
|
from .about import __version__
|
||||||
from .errors import Errors, Warnings, deprecation_warning
|
from .errors import Errors, Warnings, deprecation_warning
|
||||||
from . import util
|
from . import util
|
||||||
|
from .util import register_architecture, get_architecture
|
||||||
|
|
||||||
|
|
||||||
if sys.maxunicode == 65535:
|
if sys.maxunicode == 65535:
|
||||||
|
|
14
spacy/_ml.py
14
spacy/_ml.py
|
@ -953,16 +953,24 @@ class CharacterEmbed(Model):
|
||||||
return output, backprop_character_embed
|
return output, backprop_character_embed
|
||||||
|
|
||||||
|
|
||||||
def get_cossim_loss(yh, y, ignore_zeros=False):
    """Compute a cosine-similarity loss and its gradient for a batch of vectors.

    Reductions run over axis 1, so each row is treated as one vector.

    yh: Predicted vectors.
    y: Target vectors, compared row by row against `yh`.
    ignore_zeros (bool): If True, rows whose target is all zeros contribute
        neither to the loss nor to the gradient.
    RETURNS (tuple): `(loss, gradient)`, where `loss` is the sum over rows of
        `abs(cosine - 1)` and `gradient` is the negated derivative of the
        cosine with respect to `yh`.
    """
    xp = get_array_module(yh)
    # The zero-target rows must be located *before* the epsilon shift below,
    # otherwise no row would sum to exactly zero any more.
    if ignore_zeros:
        zero_indices = xp.abs(y).sum(axis=1) == 0
    # Shift both inputs by a small constant to avoid 0 vectors.
    epsilon = 1e-8
    yh = yh + epsilon
    y = y + epsilon
    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    losses = xp.abs(cosine - 1)
    if ignore_zeros:
        # If the target was a zero vector, don't count it in the loss.
        d_yh[zero_indices] = 0
        losses[zero_indices] = 0
    return losses.sum(), -d_yh
|
||||||
|
|
|
@ -7,7 +7,7 @@ from ...gold import docs_to_json
|
||||||
from ...util import get_lang_class, minibatch
|
from ...util import get_lang_class, minibatch
|
||||||
|
|
||||||
|
|
||||||
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
|
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
|
||||||
if lang is None:
|
if lang is None:
|
||||||
raise ValueError("No --lang specified, but tokenization required")
|
raise ValueError("No --lang specified, but tokenization required")
|
||||||
json_docs = []
|
json_docs = []
|
||||||
|
|
|
@ -6,7 +6,6 @@ import requests
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import pkg_resources
|
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from .link import link
|
from .link import link
|
||||||
|
@ -87,6 +86,8 @@ def download(model, direct=False, *pip_args):
|
||||||
|
|
||||||
def require_package(name):
|
def require_package(name):
|
||||||
try:
|
try:
|
||||||
|
import pkg_resources
|
||||||
|
|
||||||
pkg_resources.working_set.require(name)
|
pkg_resources.working_set.require(name)
|
||||||
return True
|
return True
|
||||||
except: # noqa: E722
|
except: # noqa: E722
|
||||||
|
|
|
@ -35,6 +35,7 @@ from .train import _load_pretrained_tok2vec
|
||||||
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
||||||
width=("Width of CNN layers", "option", "cw", int),
|
width=("Width of CNN layers", "option", "cw", int),
|
||||||
depth=("Depth of CNN layers", "option", "cd", int),
|
depth=("Depth of CNN layers", "option", "cd", int),
|
||||||
|
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
|
||||||
embed_rows=("Number of embedding rows", "option", "er", int),
|
embed_rows=("Number of embedding rows", "option", "er", int),
|
||||||
loss_func=(
|
loss_func=(
|
||||||
"Loss function to use for the objective. Either 'L2' or 'cosine'",
|
"Loss function to use for the objective. Either 'L2' or 'cosine'",
|
||||||
|
@ -80,6 +81,7 @@ def pretrain(
|
||||||
output_dir,
|
output_dir,
|
||||||
width=96,
|
width=96,
|
||||||
depth=4,
|
depth=4,
|
||||||
|
bilstm_depth=2,
|
||||||
embed_rows=2000,
|
embed_rows=2000,
|
||||||
loss_func="cosine",
|
loss_func="cosine",
|
||||||
use_vectors=False,
|
use_vectors=False,
|
||||||
|
@ -116,6 +118,10 @@ def pretrain(
|
||||||
util.fix_random_seed(seed)
|
util.fix_random_seed(seed)
|
||||||
|
|
||||||
has_gpu = prefer_gpu()
|
has_gpu = prefer_gpu()
|
||||||
|
if has_gpu:
|
||||||
|
import torch
|
||||||
|
|
||||||
|
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||||
|
|
||||||
output_dir = Path(output_dir)
|
output_dir = Path(output_dir)
|
||||||
|
@ -151,7 +157,7 @@ def pretrain(
|
||||||
embed_rows,
|
embed_rows,
|
||||||
conv_depth=depth,
|
conv_depth=depth,
|
||||||
pretrained_vectors=pretrained_vectors,
|
pretrained_vectors=pretrained_vectors,
|
||||||
bilstm_depth=0, # Requires PyTorch. Experimental.
|
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
||||||
cnn_maxout_pieces=3, # You can try setting this higher
|
cnn_maxout_pieces=3, # You can try setting this higher
|
||||||
subword_features=True, # Set to False for Chinese etc
|
subword_features=True, # Set to False for Chinese etc
|
||||||
),
|
),
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import pkg_resources
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import requests
|
import requests
|
||||||
|
@ -109,6 +108,8 @@ def get_model_links(compat):
|
||||||
|
|
||||||
|
|
||||||
def get_model_pkgs(compat, all_models):
|
def get_model_pkgs(compat, all_models):
|
||||||
|
import pkg_resources
|
||||||
|
|
||||||
pkgs = {}
|
pkgs = {}
|
||||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||||
package = pkg_name.replace("-", "_")
|
package = pkg_name.replace("-", "_")
|
||||||
|
|
|
@ -35,6 +35,11 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
|
try: # Python 3.8
|
||||||
|
import importlib.metadata as importlib_metadata
|
||||||
|
except ImportError:
|
||||||
|
import importlib_metadata # noqa: F401
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from thinc.neural.optimizers import Optimizer # noqa: F401
|
from thinc.neural.optimizers import Optimizer # noqa: F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|
|
@ -95,6 +95,7 @@ class Warnings(object):
|
||||||
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
|
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
|
||||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||||
"package installed.")
|
"package installed.")
|
||||||
|
W023 = ("Multiprocessing of Language.pipe is not supported in Python2. 'n_process' will be set to 1.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -495,6 +496,8 @@ class Errors(object):
|
||||||
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
||||||
"Lookups containing the lemmatization tables. See the docs for "
|
"Lookups containing the lemmatization tables. See the docs for "
|
||||||
"details: https://spacy.io/api/lemmatizer#init")
|
"details: https://spacy.io/api/lemmatizer#init")
|
||||||
|
E174 = ("Architecture '{name}' not found in registry. Available "
|
||||||
|
"names: {names}")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -184,7 +184,7 @@ _russian_lower = r"ёа-я"
|
||||||
_russian_upper = r"ЁА-Я"
|
_russian_upper = r"ЁА-Я"
|
||||||
_russian = r"ёа-яЁА-Я"
|
_russian = r"ёа-яЁА-Я"
|
||||||
|
|
||||||
_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6"
|
_sinhala = r"\u0D80-\u0DFF"
|
||||||
|
|
||||||
_tatar_lower = r"әөүҗңһ"
|
_tatar_lower = r"әөүҗңһ"
|
||||||
_tatar_upper = r"ӘӨҮҖҢҺ"
|
_tatar_upper = r"ӘӨҮҖҢҺ"
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import absolute_import, unicode_literals
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
|
import atexit
|
||||||
import random
|
import random
|
||||||
import itertools
|
import itertools
|
||||||
|
from warnings import warn
|
||||||
|
from spacy.util import minibatch
|
||||||
import weakref
|
import weakref
|
||||||
import functools
|
import functools
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
@ -10,6 +13,8 @@ from contextlib import contextmanager
|
||||||
from copy import copy, deepcopy
|
from copy import copy, deepcopy
|
||||||
from thinc.neural import Model
|
from thinc.neural import Model
|
||||||
import srsly
|
import srsly
|
||||||
|
import multiprocessing as mp
|
||||||
|
from itertools import chain, cycle
|
||||||
|
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
|
@ -21,7 +26,7 @@ from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
|
||||||
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
||||||
from .pipeline import EntityRuler
|
from .pipeline import EntityRuler
|
||||||
from .pipeline import Morphologizer
|
from .pipeline import Morphologizer
|
||||||
from .compat import izip, basestring_
|
from .compat import izip, basestring_, is_python2
|
||||||
from .gold import GoldParse
|
from .gold import GoldParse
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from ._ml import link_vectors_to_models, create_default_optimizer
|
from ._ml import link_vectors_to_models, create_default_optimizer
|
||||||
|
@ -30,8 +35,9 @@ from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
from .lang.punctuation import TOKENIZER_INFIXES
|
from .lang.punctuation import TOKENIZER_INFIXES
|
||||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||||
from .lang.tag_map import TAG_MAP
|
from .lang.tag_map import TAG_MAP
|
||||||
|
from .tokens import Doc
|
||||||
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
||||||
from .errors import Errors, Warnings, deprecation_warning
|
from .errors import Errors, Warnings, deprecation_warning, user_warning
|
||||||
from . import util
|
from . import util
|
||||||
from . import about
|
from . import about
|
||||||
|
|
||||||
|
@ -733,6 +739,7 @@ class Language(object):
|
||||||
disable=[],
|
disable=[],
|
||||||
cleanup=False,
|
cleanup=False,
|
||||||
component_cfg=None,
|
component_cfg=None,
|
||||||
|
n_process=1,
|
||||||
):
|
):
|
||||||
"""Process texts as a stream, and yield `Doc` objects in order.
|
"""Process texts as a stream, and yield `Doc` objects in order.
|
||||||
|
|
||||||
|
@ -746,12 +753,20 @@ class Language(object):
|
||||||
use. Experimental.
|
use. Experimental.
|
||||||
component_cfg (dict): An optional dictionary with extra keyword
|
component_cfg (dict): An optional dictionary with extra keyword
|
||||||
arguments for specific components.
|
arguments for specific components.
|
||||||
|
n_process (int): Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`.
|
||||||
YIELDS (Doc): Documents in the order of the original text.
|
YIELDS (Doc): Documents in the order of the original text.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#pipe
|
DOCS: https://spacy.io/api/language#pipe
|
||||||
"""
|
"""
|
||||||
|
# raw_texts will be used later to stop iterator.
|
||||||
|
texts, raw_texts = itertools.tee(texts)
|
||||||
|
if is_python2 and n_process != 1:
|
||||||
|
user_warning(Warnings.W023)
|
||||||
|
n_process = 1
|
||||||
if n_threads != -1:
|
if n_threads != -1:
|
||||||
deprecation_warning(Warnings.W016)
|
deprecation_warning(Warnings.W016)
|
||||||
|
if n_process == -1:
|
||||||
|
n_process = mp.cpu_count()
|
||||||
if as_tuples:
|
if as_tuples:
|
||||||
text_context1, text_context2 = itertools.tee(texts)
|
text_context1, text_context2 = itertools.tee(texts)
|
||||||
texts = (tc[0] for tc in text_context1)
|
texts = (tc[0] for tc in text_context1)
|
||||||
|
@ -765,9 +780,12 @@ class Language(object):
|
||||||
for doc, context in izip(docs, contexts):
|
for doc, context in izip(docs, contexts):
|
||||||
yield (doc, context)
|
yield (doc, context)
|
||||||
return
|
return
|
||||||
docs = (self.make_doc(text) for text in texts)
|
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
|
|
||||||
|
pipes = (
|
||||||
|
[]
|
||||||
|
) # contains functools.partial objects so that easily create multiprocess worker.
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if name in disable:
|
if name in disable:
|
||||||
continue
|
continue
|
||||||
|
@ -775,10 +793,20 @@ class Language(object):
|
||||||
# Allow component_cfg to overwrite the top-level kwargs.
|
# Allow component_cfg to overwrite the top-level kwargs.
|
||||||
kwargs.setdefault("batch_size", batch_size)
|
kwargs.setdefault("batch_size", batch_size)
|
||||||
if hasattr(proc, "pipe"):
|
if hasattr(proc, "pipe"):
|
||||||
docs = proc.pipe(docs, **kwargs)
|
f = functools.partial(proc.pipe, **kwargs)
|
||||||
else:
|
else:
|
||||||
# Apply the function, but yield the doc
|
# Apply the function, but yield the doc
|
||||||
docs = _pipe(proc, docs, kwargs)
|
f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
|
||||||
|
pipes.append(f)
|
||||||
|
|
||||||
|
if n_process != 1:
|
||||||
|
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
|
||||||
|
else:
|
||||||
|
# if n_process == 1, no processes are forked.
|
||||||
|
docs = (self.make_doc(text) for text in texts)
|
||||||
|
for pipe in pipes:
|
||||||
|
docs = pipe(docs)
|
||||||
|
|
||||||
# Track weakrefs of "recent" documents, so that we can see when they
|
# Track weakrefs of "recent" documents, so that we can see when they
|
||||||
# expire from memory. When they do, we know we don't need old strings.
|
# expire from memory. When they do, we know we don't need old strings.
|
||||||
# This way, we avoid maintaining an unbounded growth in string entries
|
# This way, we avoid maintaining an unbounded growth in string entries
|
||||||
|
@ -809,6 +837,46 @@ class Language(object):
|
||||||
self.tokenizer._reset_cache(keys)
|
self.tokenizer._reset_cache(keys)
|
||||||
nr_seen = 0
|
nr_seen = 0
|
||||||
|
|
||||||
|
def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
    """Process texts in `n_process` worker processes and yield docs in order.

    Each worker runs `_apply_pipes`: it reads batches of texts from its own
    queue and sends back a list of byte-serialized docs over its own pipe.
    Batches are handed out round-robin and read back round-robin, which is
    what keeps the output order equal to the input order.

    texts (iterable): The texts to process.
    pipes (list): Callables applied in order to the stream of docs.
    n_process (int): Number of worker processes to start.
    batch_size (int): Number of texts per batch sent to a worker.
    YIELDS (Doc): Docs rebuilt from the workers' bytes with `self.vocab`.
    """
    # The second copy of the stream is only used to know when to stop.
    texts, raw_texts = itertools.tee(texts)
    worker_ids = range(n_process)
    # One queue per worker for outgoing text batches.
    texts_q = [mp.Queue() for _ in worker_ids]
    # One one-way pipe per worker for incoming byte-encoded docs.
    connections = [mp.Pipe(False) for _ in worker_ids]
    bytedocs_recv_ch, bytedocs_send_ch = zip(*connections)

    # Texts are fed to the workers chunk by chunk rather than all at once,
    # so a stream of unbounded length can be handled.
    sender = _Sender(
        minibatch(texts, batch_size), texts_q, chunk_size=n_process
    )
    # Two chunks up front, so a worker already has its next batch waiting.
    for _ in range(2):
        sender.send()

    procs = []
    for text_queue, send_ch in zip(texts_q, bytedocs_send_ch):
        worker_args = (self.make_doc, pipes, text_queue, send_ch)
        procs.append(mp.Process(target=_apply_pipes, args=worker_args))
    for proc in procs:
        proc.start()

    # Read the channels in rotation so the order of docs is not broken.
    # Each message is a whole batch, hence the flattening.
    byte_batches = (conn.recv() for conn in cycle(bytedocs_recv_ch))
    byte_docs = chain.from_iterable(byte_batches)
    docs = (Doc(self.vocab).from_bytes(data) for data in byte_docs)
    n_yielded = 0
    try:
        # raw_texts must stay the first argument of zip: once it is
        # exhausted zip stops without asking `docs` for another item,
        # which would mean another recv() on a channel.
        for _, doc in zip(raw_texts, docs):
            yield doc
            n_yielded += 1
            if n_yielded % batch_size == 0:
                # One full batch was consumed; let the sender know.
                sender.step()
    finally:
        for proc in procs:
            proc.terminate()
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), disable=None):
|
def to_disk(self, path, exclude=tuple(), disable=None):
|
||||||
"""Save the current state to a directory. If a model is loaded, this
|
"""Save the current state to a directory. If a model is loaded, this
|
||||||
will include the model.
|
will include the model.
|
||||||
|
@ -987,12 +1055,55 @@ class DisabledPipes(list):
|
||||||
self[:] = []
|
self[:] = []
|
||||||
|
|
||||||
|
|
||||||
def _pipe(func, docs, kwargs):
|
def _pipe(docs, proc, kwargs):
|
||||||
# We added some args for pipe that __call__ doesn't expect.
|
# We added some args for pipe that __call__ doesn't expect.
|
||||||
kwargs = dict(kwargs)
|
kwargs = dict(kwargs)
|
||||||
for arg in ["n_threads", "batch_size"]:
|
for arg in ["n_threads", "batch_size"]:
|
||||||
if arg in kwargs:
|
if arg in kwargs:
|
||||||
kwargs.pop(arg)
|
kwargs.pop(arg)
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
doc = func(doc, **kwargs)
|
doc = proc(doc, **kwargs)
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_pipes(make_doc, pipes, reciever, sender):
|
||||||
|
"""Worker for Language.pipe
|
||||||
|
|
||||||
|
Args:
|
||||||
|
receiver (multiprocessing.Connection): Pipe to receive text. Usually created by `multiprocessing.Pipe()`
|
||||||
|
sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()`
|
||||||
|
"""
|
||||||
|
while True:
|
||||||
|
texts = reciever.get()
|
||||||
|
docs = (make_doc(text) for text in texts)
|
||||||
|
for pipe in pipes:
|
||||||
|
docs = pipe(docs)
|
||||||
|
# Connection does not accept unpickable objects, so send list.
|
||||||
|
sender.send([doc.to_bytes() for doc in docs])
|
||||||
|
|
||||||
|
|
||||||
|
class _Sender:
|
||||||
|
"""Util for sending data to multiprocessing workers in Language.pipe"""
|
||||||
|
|
||||||
|
def __init__(self, data, queues, chunk_size):
|
||||||
|
self.data = iter(data)
|
||||||
|
self.queues = iter(cycle(queues))
|
||||||
|
self.chunk_size = chunk_size
|
||||||
|
self.count = 0
|
||||||
|
|
||||||
|
def send(self):
|
||||||
|
"""Send chunk_size items from self.data to channels."""
|
||||||
|
for item, q in itertools.islice(
|
||||||
|
zip(self.data, cycle(self.queues)), self.chunk_size
|
||||||
|
):
|
||||||
|
# cycle channels so that distribute the texts evenly
|
||||||
|
q.put(item)
|
||||||
|
|
||||||
|
def step(self):
|
||||||
|
"""Tell sender that comsumed one item.
|
||||||
|
|
||||||
|
Data is sent to the workers after every chunk_size calls."""
|
||||||
|
self.count += 1
|
||||||
|
if self.count >= self.chunk_size:
|
||||||
|
self.count = 0
|
||||||
|
self.send()
|
||||||
|
|
|
@ -225,7 +225,7 @@ cdef class PhraseMatcher:
|
||||||
for i in range(c_matches.size()):
|
for i in range(c_matches.size()):
|
||||||
matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end))
|
matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end))
|
||||||
for i, (ent_id, start, end) in enumerate(matches):
|
for i, (ent_id, start, end) in enumerate(matches):
|
||||||
on_match = self._callbacks.get(ent_id)
|
on_match = self._callbacks.get(self.vocab.strings[ent_id])
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
on_match(self, doc, i, matches)
|
on_match(self, doc, i, matches)
|
||||||
return matches
|
return matches
|
||||||
|
|
|
@ -29,7 +29,7 @@ from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
|
||||||
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
|
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
|
||||||
from .._ml import build_bow_text_classifier, build_nel_encoder
|
from .._ml import build_bow_text_classifier, build_nel_encoder
|
||||||
from .._ml import link_vectors_to_models, zero_init, flatten
|
from .._ml import link_vectors_to_models, zero_init, flatten
|
||||||
from .._ml import masked_language_model, create_default_optimizer
|
from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
|
||||||
from ..errors import Errors, TempErrors, user_warning, Warnings
|
from ..errors import Errors, TempErrors, user_warning, Warnings
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -880,8 +880,7 @@ class ClozeMultitask(Pipe):
|
||||||
# and look them up all at once. This prevents data copying.
|
# and look them up all at once. This prevents data copying.
|
||||||
ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||||
target = vectors[ids]
|
target = vectors[ids]
|
||||||
gradient = (prediction - target) / prediction.shape[0]
|
loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
|
||||||
loss = (gradient**2).sum()
|
|
||||||
return float(loss), gradient
|
return float(loss), gradient
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import re
|
import re
|
||||||
|
from mock import Mock
|
||||||
from spacy.matcher import Matcher, DependencyMatcher
|
from spacy.matcher import Matcher, DependencyMatcher
|
||||||
from spacy.tokens import Doc, Token
|
from spacy.tokens import Doc, Token
|
||||||
|
|
||||||
|
@ -418,3 +419,13 @@ def test_matcher_valid_callback(en_vocab):
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher.add("TEST", [], [{"TEXT": "test"}])
|
matcher.add("TEST", [], [{"TEXT": "test"}])
|
||||||
matcher(Doc(en_vocab, words=["test"]))
|
matcher(Doc(en_vocab, words=["test"]))
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_callback(en_vocab):
    """The on_match callback of a Matcher rule is called once for one match."""
    on_match = Mock()
    matcher = Matcher(en_vocab)
    matcher.add("Rule", on_match, [{"ORTH": "test"}])
    words = ["This", "is", "a", "test", "."]
    doc = Doc(en_vocab, words=words)
    matches = matcher(doc)
    # Expected call: the matcher, the doc, the match index and all matches.
    on_match.assert_called_once_with(matcher, doc, 0, matches)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from mock import Mock
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
@ -215,3 +216,13 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
matcher.add("TEST3", None, doc3)
|
matcher.add("TEST3", None, doc3)
|
||||||
matcher = PhraseMatcher(en_vocab, attr="TEXT")
|
matcher = PhraseMatcher(en_vocab, attr="TEXT")
|
||||||
matcher.add("TEST3", None, doc3)
|
matcher.add("TEST3", None, doc3)
|
||||||
|
|
||||||
|
|
||||||
|
def test_phrase_matcher_callback(en_vocab):
    """The on_match callback of a PhraseMatcher rule is called once for one match."""
    on_match = Mock()
    text_words = ["I", "like", "Google", "Now", "best"]
    doc = Doc(en_vocab, words=text_words)
    phrase = Doc(en_vocab, words=["Google", "Now"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("COMPANY", on_match, phrase)
    matches = matcher(doc)
    # Expected call: the matcher, the doc, the match index and all matches.
    on_match.assert_called_once_with(matcher, doc, 0, matches)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import spacy
|
||||||
from spacy.pipeline import Sentencizer
|
from spacy.pipeline import Sentencizer
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
@ -85,3 +86,26 @@ def test_sentencizer_serialize_bytes(en_vocab):
|
||||||
bytes_data = sentencizer.to_bytes()
|
bytes_data = sentencizer.to_bytes()
|
||||||
new_sentencizer = Sentencizer().from_bytes(bytes_data)
|
new_sentencizer = Sentencizer().from_bytes(bytes_data)
|
||||||
assert new_sentencizer.punct_chars == set(punct_chars)
|
assert new_sentencizer.punct_chars == set(punct_chars)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
# fmt: off
|
||||||
|
"lang,text",
|
||||||
|
[
|
||||||
|
('bn', 'বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।'),
|
||||||
|
('de', 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.'),
|
||||||
|
('hi', 'हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]'),
|
||||||
|
('kn', 'ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]'),
|
||||||
|
('si', 'ශ්රී ලංකාවේ ප්රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්රී ලංකාවේ නිල භාෂාවයි .'),
|
||||||
|
('ta', 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]'),
|
||||||
|
('te', 'ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.'),
|
||||||
|
('ur', 'اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔'),
|
||||||
|
],
|
||||||
|
# fmt: on
|
||||||
|
)
|
||||||
|
def test_sentencizer_across_scripts(lang, text):
    """Check that the sentencizer finds more than one sentence in `text`.

    lang (unicode): Language code used to create the blank pipeline.
    text (unicode): Sample text written in that language.
    """
    nlp = spacy.blank(lang)
    # The Sentencizer is the only component added to the blank pipeline.
    nlp.add_pipe(Sentencizer())
    sentences = list(nlp(text).sents)
    assert len(sentences) > 1
|
||||||
|
|
|
@ -1,11 +1,16 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.vocab import Vocab
|
from spacy.compat import is_python2
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.tokens import Doc, Span
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
from .util import add_vecs_to_vocab, assert_docs_equal
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -58,3 +63,74 @@ def test_language_evaluate(nlp):
|
||||||
# Evaluate badly
|
# Evaluate badly
|
||||||
with pytest.raises(Exception):
|
with pytest.raises(Exception):
|
||||||
nlp.evaluate([text, gold])
|
nlp.evaluate([text, gold])
|
||||||
|
|
||||||
|
|
||||||
|
def vector_modification_pipe(doc):
    """Pipeline component that adds 1 to `doc.vector`.

    doc: The object being processed; its `vector` attribute is updated.
    RETURNS: The same `doc` object that was passed in.
    """
    doc.vector += 1
    return doc
|
||||||
|
|
||||||
|
|
||||||
|
def userdata_pipe(doc):
    """Pipeline component that stores "bar" under the "foo" key of
    `doc.user_data`.

    doc: The object being processed; its `user_data` mapping is updated.
    RETURNS: The same `doc` object that was passed in.
    """
    doc.user_data["foo"] = "bar"
    return doc
|
||||||
|
|
||||||
|
|
||||||
|
def ner_pipe(doc):
    """Pipeline component that appends a "FIRST"-labelled span to `doc.ents`.

    doc (Doc): The document being processed.
    RETURNS (Doc): The same document, with the extra entity added.
    """
    # The span covers token indices 0 to 1 (end exclusive).
    first_token_span = Span(doc, 0, 1, label="FIRST")
    doc.ents += (first_token_span,)
    return doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_vectors():
    """Fixture providing a list of (word, 3-dimensional vector) pairs."""
    vectors = [
        ("spacy", [-0.1, -0.2, -0.3]),
        ("world", [-0.2, -0.3, -0.4]),
        ("pipe", [0.7, 0.8, 0.9]),
    ]
    return vectors
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def nlp2(nlp, sample_vectors):
    """Fixture extending `nlp` with the sample vectors and custom components.

    The sample vectors are passed to `add_vecs_to_vocab` for `nlp.vocab`, then
    the three module-level components are added to the pipeline in a fixed
    order. The `nlp` object itself is modified and returned.
    """
    add_vecs_to_vocab(nlp.vocab, sample_vectors)
    # Order matters: components run in the order they are added.
    for component in (vector_modification_pipe, ner_pipe, userdata_pipe):
        nlp.add_pipe(component)
    return nlp
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def texts():
    """Fixture providing four short example texts as a list."""
    return [
        "Hello world.",
        "This is spacy.",
        "You can use multiprocessing with pipe method.",
        "Please try!",
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe(nlp2, n_process, texts):
    """Check that `nlp.pipe` produces the same docs as calling `nlp` per text.

    nlp2: Pipeline fixture with the custom components added.
    n_process (int): Value passed to `nlp.pipe` as `n_process`.
    texts (list): Example texts; repeated ten times to get more batches.
    """
    texts = texts * 10
    expecteds = [nlp2(text) for text in texts]
    # Materialize the results before comparing: zip() stops at the shorter
    # input, so a pipe that dropped documents would otherwise pass unnoticed.
    docs = list(nlp2.pipe(texts, n_process=n_process, batch_size=2))
    assert len(docs) == len(expecteds)

    for doc, expected_doc in zip(docs, expecteds):
        assert_docs_equal(doc, expected_doc)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
    is_python2, reason="python2 seems to be unable to handle iterator properly"
)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
    """Check that `nlp.pipe` can consume an infinite-length iterator.

    nlp2: Pipeline fixture with the custom components added.
    n_process (int): Value passed to `nlp.pipe` as `n_process`.
    texts (list): Example texts that are cycled endlessly.
    """
    stream_texts = itertools.cycle(texts)
    # Duplicate the endless stream so the reference docs and the piped docs
    # are built from the same sequence of texts.
    texts0, texts1 = itertools.tee(stream_texts)
    expecteds = (nlp2(text) for text in texts0)
    docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)

    n_fetch = 20
    n_compared = 0
    for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
        assert_docs_equal(doc, expected_doc)
        n_compared += 1
    # Without this check the loop would silently end early if the pipe
    # stopped yielding documents before n_fetch pairs were compared.
    assert n_compared == n_fetch
|
||||||
|
|
19
spacy/tests/test_register_architecture.py
Normal file
19
spacy/tests/test_register_architecture.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from spacy import register_architecture
|
||||||
|
from spacy import get_architecture
|
||||||
|
from thinc.v2v import Affine
|
||||||
|
|
||||||
|
|
||||||
|
@register_architecture("my_test_function")
def create_model(nr_in, nr_out):
    """Architecture function registered as "my_test_function" for the tests.

    nr_in: First argument passed on to `Affine`.
    nr_out: Second argument passed on to `Affine`.
    RETURNS: The object created by `Affine(nr_in, nr_out)`.
    """
    model = Affine(nr_in, nr_out)
    return model
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_architecture():
    """Check lookup of a registered architecture and of an unknown name."""
    # The decorator above registered create_model under this name.
    assert get_architecture("my_test_function") is create_model
    # Unknown names are expected to raise a KeyError.
    with pytest.raises(KeyError):
        get_architecture("not_an_existing_key")
|
|
@ -2,7 +2,6 @@
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import pkg_resources
|
|
||||||
import importlib
|
import importlib
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -28,15 +27,21 @@ except ImportError:
|
||||||
|
|
||||||
from .symbols import ORTH
|
from .symbols import ORTH
|
||||||
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
||||||
from .compat import import_file
|
from .compat import import_file, importlib_metadata
|
||||||
from .errors import Errors, Warnings, deprecation_warning
|
from .errors import Errors, Warnings, deprecation_warning
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
|
ARCHITECTURES = {}
|
||||||
_data_path = Path(__file__).parent / "data"
|
_data_path = Path(__file__).parent / "data"
|
||||||
_PRINT_ENV = False
|
_PRINT_ENV = False
|
||||||
|
|
||||||
|
|
||||||
|
# NB: Only ever call this once! If called more than once within the
|
||||||
|
# function, test_issue1506 hangs and it's not 100% clear why.
|
||||||
|
AVAILABLE_ENTRY_POINTS = importlib_metadata.entry_points()
|
||||||
|
|
||||||
|
|
||||||
class ENTRY_POINTS(object):
|
class ENTRY_POINTS(object):
|
||||||
"""Available entry points to register extensions."""
|
"""Available entry points to register extensions."""
|
||||||
|
|
||||||
|
@ -44,6 +49,7 @@ class ENTRY_POINTS(object):
|
||||||
languages = "spacy_languages"
|
languages = "spacy_languages"
|
||||||
displacy_colors = "spacy_displacy_colors"
|
displacy_colors = "spacy_displacy_colors"
|
||||||
lookups = "spacy_lookups"
|
lookups = "spacy_lookups"
|
||||||
|
architectures = "spacy_architectures"
|
||||||
|
|
||||||
|
|
||||||
def set_env_log(value):
|
def set_env_log(value):
|
||||||
|
@ -115,6 +121,44 @@ def set_data_path(path):
|
||||||
_data_path = ensure_path(path)
|
_data_path = ensure_path(path)
|
||||||
|
|
||||||
|
|
||||||
|
def register_architecture(name, arch=None):
    """Register an architecture under a name, directly or as a decorator.
    An architecture is a function that returns a Thinc Model object.

    name (unicode): The name of the architecture to register.
    arch (callable): Optional architecture, for when the function is called
        directly instead of being used as a decorator.
    RETURNS (callable): The registered architecture if `arch` was given,
        otherwise a decorator that registers the function it is applied to.
    """

    def do_registration(arch):
        # Store the architecture in the module-level registry and hand it
        # back unchanged so the decorated function stays usable.
        ARCHITECTURES[name] = arch
        return arch

    if arch is None:
        return do_registration
    return do_registration(arch)
|
||||||
|
|
||||||
|
|
||||||
|
def get_architecture(name):
    """Look up a model architecture function by name.

    name (unicode): The name of the architecture.
    RETURNS (callable): The architecture registered under that name.
    RAISES (KeyError): If no architecture with that name is available.
    """
    # An entry point exposed under this name is stored in the registry first,
    # replacing any architecture previously registered under the same name.
    from_entry_point = get_entry_point(ENTRY_POINTS.architectures, name)
    if from_entry_point is not None:
        ARCHITECTURES[name] = from_entry_point
    if name in ARCHITECTURES:
        return ARCHITECTURES[name]
    # Include the sorted list of known names in the error message.
    available = ", ".join(sorted(ARCHITECTURES.keys()))
    raise KeyError(Errors.E174.format(name=name, names=available))
|
||||||
|
|
||||||
|
|
||||||
def ensure_path(path):
|
def ensure_path(path):
|
||||||
"""Ensure string is converted to a Path.
|
"""Ensure string is converted to a Path.
|
||||||
|
|
||||||
|
@ -253,6 +297,8 @@ def is_package(name):
|
||||||
name (unicode): Name of package.
|
name (unicode): Name of package.
|
||||||
RETURNS (bool): True if installed package, False if not.
|
RETURNS (bool): True if installed package, False if not.
|
||||||
"""
|
"""
|
||||||
|
import pkg_resources
|
||||||
|
|
||||||
name = name.lower() # compare package name against lowercase name
|
name = name.lower() # compare package name against lowercase name
|
||||||
packages = pkg_resources.working_set.by_key.keys()
|
packages = pkg_resources.working_set.by_key.keys()
|
||||||
for package in packages:
|
for package in packages:
|
||||||
|
@ -282,7 +328,7 @@ def get_entry_points(key):
|
||||||
RETURNS (dict): Entry points, keyed by name.
|
RETURNS (dict): Entry points, keyed by name.
|
||||||
"""
|
"""
|
||||||
result = {}
|
result = {}
|
||||||
for entry_point in pkg_resources.iter_entry_points(key):
|
for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
|
||||||
result[entry_point.name] = entry_point.load()
|
result[entry_point.name] = entry_point.load()
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@ -296,7 +342,7 @@ def get_entry_point(key, value, default=None):
|
||||||
default: Optional default value to return.
|
default: Optional default value to return.
|
||||||
RETURNS: The loaded entry point or None.
|
RETURNS: The loaded entry point or None.
|
||||||
"""
|
"""
|
||||||
for entry_point in pkg_resources.iter_entry_points(key):
|
for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
|
||||||
if entry_point.name == value:
|
if entry_point.name == value:
|
||||||
return entry_point.load()
|
return entry_point.load()
|
||||||
return default
|
return default
|
||||||
|
|
|
@ -337,7 +337,7 @@ cdef class Vectors:
|
||||||
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
|
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
|
||||||
|
|
||||||
if sort:
|
if sort:
|
||||||
sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores, axis=1)[:,::-1]
|
sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
|
||||||
scores[i:i+batch_size] = scores[sorted_index]
|
scores[i:i+batch_size] = scores[sorted_index]
|
||||||
best_rows[i:i+batch_size] = best_rows[sorted_index]
|
best_rows[i:i+batch_size] = best_rows[sorted_index]
|
||||||
|
|
||||||
|
|
|
@ -8,10 +8,10 @@
|
||||||
"en_core_web_md",
|
"en_core_web_md",
|
||||||
"en_core_web_lg",
|
"en_core_web_lg",
|
||||||
"en_vectors_web_lg",
|
"en_vectors_web_lg",
|
||||||
"en_pytt_bertbaseuncased_lg",
|
"en_trf_bertbaseuncased_lg",
|
||||||
"en_pytt_robertabase_lg",
|
"en_trf_robertabase_lg",
|
||||||
"en_pytt_distilbertbaseuncased_lg",
|
"en_trf_distilbertbaseuncased_lg",
|
||||||
"en_pytt_xlnetbasecased_lg"
|
"en_trf_xlnetbasecased_lg"
|
||||||
],
|
],
|
||||||
"example": "This is a sentence.",
|
"example": "This is a sentence.",
|
||||||
"has_examples": true
|
"has_examples": true
|
||||||
|
@ -19,7 +19,7 @@
|
||||||
{
|
{
|
||||||
"code": "de",
|
"code": "de",
|
||||||
"name": "German",
|
"name": "German",
|
||||||
"models": ["de_core_news_sm", "de_core_news_md", "de_pytt_bertbasecased_lg"],
|
"models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
|
||||||
"example": "Dies ist ein Satz.",
|
"example": "Dies ist ein Satz.",
|
||||||
"has_examples": true
|
"has_examples": true
|
||||||
},
|
},
|
||||||
|
|
|
@ -1675,21 +1675,21 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": "spacy-pytorch-transformers",
|
"id": "spacy-transformers",
|
||||||
"title": "spacy-pytorch-transformers",
|
"title": "spacy-transformers",
|
||||||
"slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
|
"slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
|
||||||
"description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
|
"description": "This package provides spaCy model pipelines that wrap [Hugging Face's `transformers`](https://github.com/huggingface/transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
|
||||||
"github": "explosion/spacy-pytorch-transformers",
|
"github": "explosion/spacy-transformers",
|
||||||
"url": "https://explosion.ai/blog/spacy-pytorch-transformers",
|
"url": "https://explosion.ai/blog/spacy-transformers",
|
||||||
"pip": "spacy-pytorch-transformers",
|
"pip": "spacy-transformers",
|
||||||
"category": ["pipeline", "models", "research"],
|
"category": ["pipeline", "models", "research"],
|
||||||
"code_example": [
|
"code_example": [
|
||||||
"import spacy",
|
"import spacy",
|
||||||
"",
|
"",
|
||||||
"nlp = spacy.load(\"en_pytt_bertbaseuncased_lg\")",
|
"nlp = spacy.load(\"en_trf_bertbaseuncased_lg\")",
|
||||||
"doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")",
|
"doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")",
|
||||||
"print(doc[0].similarity(doc[7]))",
|
"print(doc[0].similarity(doc[7]))",
|
||||||
"print(doc._.pytt_last_hidden_state.shape)"
|
"print(doc._.trf_last_hidden_state.shape)"
|
||||||
],
|
],
|
||||||
"author": "Explosion",
|
"author": "Explosion",
|
||||||
"author_links": {
|
"author_links": {
|
||||||
|
|
|
@ -23,6 +23,7 @@ const MODEL_META = {
|
||||||
dep: 'Vocabulary, syntax',
|
dep: 'Vocabulary, syntax',
|
||||||
ent: 'Named entities',
|
ent: 'Named entities',
|
||||||
pytt: 'PyTorch Transformers',
|
pytt: 'PyTorch Transformers',
|
||||||
|
trf: 'Transformers',
|
||||||
vectors: 'Word vectors',
|
vectors: 'Word vectors',
|
||||||
web: 'written text (blogs, news, comments)',
|
web: 'written text (blogs, news, comments)',
|
||||||
news: 'written text (news, media)',
|
news: 'written text (news, media)',
|
||||||
|
|
Loading…
Reference in New Issue
Block a user