mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-05 23:06:28 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
e8f284c741
|
@ -48,4 +48,6 @@ redirects = [
|
|||
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
|
||||
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
|
||||
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
|
||||
# Renamed universe projects
|
||||
{from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true}
|
||||
]
|
||||
|
|
|
@ -11,6 +11,7 @@ numpy>=1.15.0
|
|||
requests>=2.13.0,<3.0.0
|
||||
plac<1.0.0,>=0.9.6
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
importlib_metadata>=0.23; python_version < "3.8"
|
||||
# Optional dependencies
|
||||
jsonschema>=2.6.0,<3.1.0
|
||||
# Development dependencies
|
||||
|
|
|
@ -39,6 +39,7 @@ setup_requires =
|
|||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=7.1.1,<7.2.0
|
||||
install_requires =
|
||||
setuptools
|
||||
numpy>=1.15.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
|
@ -50,6 +51,7 @@ install_requires =
|
|||
wasabi>=0.2.0,<1.1.0
|
||||
srsly>=0.1.0,<1.1.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
importlib_metadata>=0.23; python_version < "3.8"
|
||||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
|
|
|
@ -14,6 +14,7 @@ from .glossary import explain
|
|||
from .about import __version__
|
||||
from .errors import Errors, Warnings, deprecation_warning
|
||||
from . import util
|
||||
from .util import register_architecture, get_architecture
|
||||
|
||||
|
||||
if sys.maxunicode == 65535:
|
||||
|
|
14
spacy/_ml.py
14
spacy/_ml.py
|
@ -953,16 +953,24 @@ class CharacterEmbed(Model):
|
|||
return output, backprop_character_embed
|
||||
|
||||
|
||||
def get_cossim_loss(yh, y):
|
||||
def get_cossim_loss(yh, y, ignore_zeros=False):
|
||||
xp = get_array_module(yh)
|
||||
# Find the zero vectors
|
||||
if ignore_zeros:
|
||||
zero_indices = xp.abs(y).sum(axis=1) == 0
|
||||
# Add a small constant to avoid 0 vectors
|
||||
yh = yh + 1e-8
|
||||
y = y + 1e-8
|
||||
# https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
|
||||
xp = get_array_module(yh)
|
||||
norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
|
||||
norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
|
||||
mul_norms = norm_yh * norm_y
|
||||
cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
|
||||
d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
|
||||
loss = xp.abs(cosine - 1).sum()
|
||||
losses = xp.abs(cosine - 1)
|
||||
if ignore_zeros:
|
||||
# If the target was a zero vector, don't count it in the loss.
|
||||
d_yh[zero_indices] = 0
|
||||
losses[zero_indices] = 0
|
||||
loss = losses.sum()
|
||||
return loss, -d_yh
|
||||
|
|
|
@ -7,7 +7,7 @@ from ...gold import docs_to_json
|
|||
from ...util import get_lang_class, minibatch
|
||||
|
||||
|
||||
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
|
||||
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
|
||||
if lang is None:
|
||||
raise ValueError("No --lang specified, but tokenization required")
|
||||
json_docs = []
|
||||
|
|
|
@ -6,7 +6,6 @@ import requests
|
|||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import pkg_resources
|
||||
from wasabi import Printer
|
||||
|
||||
from .link import link
|
||||
|
@ -87,6 +86,8 @@ def download(model, direct=False, *pip_args):
|
|||
|
||||
def require_package(name):
|
||||
try:
|
||||
import pkg_resources
|
||||
|
||||
pkg_resources.working_set.require(name)
|
||||
return True
|
||||
except: # noqa: E722
|
||||
|
|
|
@ -35,6 +35,7 @@ from .train import _load_pretrained_tok2vec
|
|||
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
||||
width=("Width of CNN layers", "option", "cw", int),
|
||||
depth=("Depth of CNN layers", "option", "cd", int),
|
||||
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
|
||||
embed_rows=("Number of embedding rows", "option", "er", int),
|
||||
loss_func=(
|
||||
"Loss function to use for the objective. Either 'L2' or 'cosine'",
|
||||
|
@ -80,6 +81,7 @@ def pretrain(
|
|||
output_dir,
|
||||
width=96,
|
||||
depth=4,
|
||||
bilstm_depth=2,
|
||||
embed_rows=2000,
|
||||
loss_func="cosine",
|
||||
use_vectors=False,
|
||||
|
@ -116,6 +118,10 @@ def pretrain(
|
|||
util.fix_random_seed(seed)
|
||||
|
||||
has_gpu = prefer_gpu()
|
||||
if has_gpu:
|
||||
import torch
|
||||
|
||||
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
|
@ -151,7 +157,7 @@ def pretrain(
|
|||
embed_rows,
|
||||
conv_depth=depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=0, # Requires PyTorch. Experimental.
|
||||
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
||||
cnn_maxout_pieces=3, # You can try setting this higher
|
||||
subword_features=True, # Set to False for Chinese etc
|
||||
),
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import pkg_resources
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import requests
|
||||
|
@ -109,6 +108,8 @@ def get_model_links(compat):
|
|||
|
||||
|
||||
def get_model_pkgs(compat, all_models):
|
||||
import pkg_resources
|
||||
|
||||
pkgs = {}
|
||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||
package = pkg_name.replace("-", "_")
|
||||
|
|
|
@ -35,6 +35,11 @@ try:
|
|||
except ImportError:
|
||||
cupy = None
|
||||
|
||||
try: # Python 3.8
|
||||
import importlib.metadata as importlib_metadata
|
||||
except ImportError:
|
||||
import importlib_metadata # noqa: F401
|
||||
|
||||
try:
|
||||
from thinc.neural.optimizers import Optimizer # noqa: F401
|
||||
except ImportError:
|
||||
|
|
|
@ -95,6 +95,7 @@ class Warnings(object):
|
|||
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
|
||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||
"package installed.")
|
||||
W023 = ("Multiprocessing of Language.pipe is not supported in Python2. 'n_process' will be set to 1.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -495,6 +496,8 @@ class Errors(object):
|
|||
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
||||
"Lookups containing the lemmatization tables. See the docs for "
|
||||
"details: https://spacy.io/api/lemmatizer#init")
|
||||
E174 = ("Architecture '{name}' not found in registry. Available "
|
||||
"names: {names}")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -184,7 +184,7 @@ _russian_lower = r"ёа-я"
|
|||
_russian_upper = r"ЁА-Я"
|
||||
_russian = r"ёа-яЁА-Я"
|
||||
|
||||
_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6"
|
||||
_sinhala = r"\u0D80-\u0DFF"
|
||||
|
||||
_tatar_lower = r"әөүҗңһ"
|
||||
_tatar_upper = r"ӘӨҮҖҢҺ"
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
# coding: utf8
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
import atexit
|
||||
import random
|
||||
import itertools
|
||||
from warnings import warn
|
||||
from spacy.util import minibatch
|
||||
import weakref
|
||||
import functools
|
||||
from collections import OrderedDict
|
||||
|
@ -10,6 +13,8 @@ from contextlib import contextmanager
|
|||
from copy import copy, deepcopy
|
||||
from thinc.neural import Model
|
||||
import srsly
|
||||
import multiprocessing as mp
|
||||
from itertools import chain, cycle
|
||||
|
||||
from .tokenizer import Tokenizer
|
||||
from .vocab import Vocab
|
||||
|
@ -21,7 +26,7 @@ from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
|
|||
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
||||
from .pipeline import EntityRuler
|
||||
from .pipeline import Morphologizer
|
||||
from .compat import izip, basestring_
|
||||
from .compat import izip, basestring_, is_python2
|
||||
from .gold import GoldParse
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models, create_default_optimizer
|
||||
|
@ -30,8 +35,9 @@ from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
|||
from .lang.punctuation import TOKENIZER_INFIXES
|
||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||
from .lang.tag_map import TAG_MAP
|
||||
from .tokens import Doc
|
||||
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
||||
from .errors import Errors, Warnings, deprecation_warning
|
||||
from .errors import Errors, Warnings, deprecation_warning, user_warning
|
||||
from . import util
|
||||
from . import about
|
||||
|
||||
|
@ -733,6 +739,7 @@ class Language(object):
|
|||
disable=[],
|
||||
cleanup=False,
|
||||
component_cfg=None,
|
||||
n_process=1,
|
||||
):
|
||||
"""Process texts as a stream, and yield `Doc` objects in order.
|
||||
|
||||
|
@ -746,12 +753,20 @@ class Language(object):
|
|||
use. Experimental.
|
||||
component_cfg (dict): An optional dictionary with extra keyword
|
||||
arguments for specific components.
|
||||
n_process (int): Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`.
|
||||
YIELDS (Doc): Documents in the order of the original text.
|
||||
|
||||
DOCS: https://spacy.io/api/language#pipe
|
||||
"""
|
||||
# raw_texts will be used later to stop iterator.
|
||||
texts, raw_texts = itertools.tee(texts)
|
||||
if is_python2 and n_process != 1:
|
||||
user_warning(Warnings.W023)
|
||||
n_process = 1
|
||||
if n_threads != -1:
|
||||
deprecation_warning(Warnings.W016)
|
||||
if n_process == -1:
|
||||
n_process = mp.cpu_count()
|
||||
if as_tuples:
|
||||
text_context1, text_context2 = itertools.tee(texts)
|
||||
texts = (tc[0] for tc in text_context1)
|
||||
|
@ -765,9 +780,12 @@ class Language(object):
|
|||
for doc, context in izip(docs, contexts):
|
||||
yield (doc, context)
|
||||
return
|
||||
docs = (self.make_doc(text) for text in texts)
|
||||
if component_cfg is None:
|
||||
component_cfg = {}
|
||||
|
||||
pipes = (
|
||||
[]
|
||||
) # contains functools.partial objects so that easily create multiprocess worker.
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
|
@ -775,10 +793,20 @@ class Language(object):
|
|||
# Allow component_cfg to overwrite the top-level kwargs.
|
||||
kwargs.setdefault("batch_size", batch_size)
|
||||
if hasattr(proc, "pipe"):
|
||||
docs = proc.pipe(docs, **kwargs)
|
||||
f = functools.partial(proc.pipe, **kwargs)
|
||||
else:
|
||||
# Apply the function, but yield the doc
|
||||
docs = _pipe(proc, docs, kwargs)
|
||||
f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
|
||||
pipes.append(f)
|
||||
|
||||
if n_process != 1:
|
||||
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
|
||||
else:
|
||||
# if n_process == 1, no processes are forked.
|
||||
docs = (self.make_doc(text) for text in texts)
|
||||
for pipe in pipes:
|
||||
docs = pipe(docs)
|
||||
|
||||
# Track weakrefs of "recent" documents, so that we can see when they
|
||||
# expire from memory. When they do, we know we don't need old strings.
|
||||
# This way, we avoid maintaining an unbounded growth in string entries
|
||||
|
@ -809,6 +837,46 @@ class Language(object):
|
|||
self.tokenizer._reset_cache(keys)
|
||||
nr_seen = 0
|
||||
|
||||
def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
|
||||
# raw_texts is used later to stop iteration.
|
||||
texts, raw_texts = itertools.tee(texts)
|
||||
# for sending texts to worker
|
||||
texts_q = [mp.Queue() for _ in range(n_process)]
|
||||
# for receiving byte encoded docs from worker
|
||||
bytedocs_recv_ch, bytedocs_send_ch = zip(
|
||||
*[mp.Pipe(False) for _ in range(n_process)]
|
||||
)
|
||||
|
||||
batch_texts = minibatch(texts, batch_size)
|
||||
# Sender sends texts to the workers.
|
||||
# This is necessary to properly handle infinite length of texts.
|
||||
# (In this case, all data cannot be sent to the workers at once)
|
||||
sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
|
||||
# send twice so that make process busy
|
||||
sender.send()
|
||||
sender.send()
|
||||
|
||||
procs = [
|
||||
mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
|
||||
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
||||
]
|
||||
for proc in procs:
|
||||
proc.start()
|
||||
|
||||
# Cycle channels not to break the order of docs.
|
||||
# The received object is batch of byte encoded docs, so flatten them with chain.from_iterable.
|
||||
byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
|
||||
docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
|
||||
try:
|
||||
for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
|
||||
yield doc
|
||||
if i % batch_size == 0:
|
||||
# tell `sender` that one batch was consumed.
|
||||
sender.step()
|
||||
finally:
|
||||
for proc in procs:
|
||||
proc.terminate()
|
||||
|
||||
def to_disk(self, path, exclude=tuple(), disable=None):
|
||||
"""Save the current state to a directory. If a model is loaded, this
|
||||
will include the model.
|
||||
|
@ -987,12 +1055,55 @@ class DisabledPipes(list):
|
|||
self[:] = []
|
||||
|
||||
|
||||
def _pipe(func, docs, kwargs):
|
||||
def _pipe(docs, proc, kwargs):
|
||||
# We added some args for pipe that __call__ doesn't expect.
|
||||
kwargs = dict(kwargs)
|
||||
for arg in ["n_threads", "batch_size"]:
|
||||
if arg in kwargs:
|
||||
kwargs.pop(arg)
|
||||
for doc in docs:
|
||||
doc = func(doc, **kwargs)
|
||||
doc = proc(doc, **kwargs)
|
||||
yield doc
|
||||
|
||||
|
||||
def _apply_pipes(make_doc, pipes, reciever, sender):
|
||||
"""Worker for Language.pipe
|
||||
|
||||
Args:
|
||||
receiver (multiprocessing.Connection): Pipe to receive text. Usually created by `multiprocessing.Pipe()`
|
||||
sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()`
|
||||
"""
|
||||
while True:
|
||||
texts = reciever.get()
|
||||
docs = (make_doc(text) for text in texts)
|
||||
for pipe in pipes:
|
||||
docs = pipe(docs)
|
||||
# Connection does not accept unpickable objects, so send list.
|
||||
sender.send([doc.to_bytes() for doc in docs])
|
||||
|
||||
|
||||
class _Sender:
|
||||
"""Util for sending data to multiprocessing workers in Language.pipe"""
|
||||
|
||||
def __init__(self, data, queues, chunk_size):
|
||||
self.data = iter(data)
|
||||
self.queues = iter(cycle(queues))
|
||||
self.chunk_size = chunk_size
|
||||
self.count = 0
|
||||
|
||||
def send(self):
|
||||
"""Send chunk_size items from self.data to channels."""
|
||||
for item, q in itertools.islice(
|
||||
zip(self.data, cycle(self.queues)), self.chunk_size
|
||||
):
|
||||
# cycle channels so that distribute the texts evenly
|
||||
q.put(item)
|
||||
|
||||
def step(self):
|
||||
"""Tell sender that comsumed one item.
|
||||
|
||||
Data is sent to the workers after every chunk_size calls."""
|
||||
self.count += 1
|
||||
if self.count >= self.chunk_size:
|
||||
self.count = 0
|
||||
self.send()
|
||||
|
|
|
@ -225,7 +225,7 @@ cdef class PhraseMatcher:
|
|||
for i in range(c_matches.size()):
|
||||
matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end))
|
||||
for i, (ent_id, start, end) in enumerate(matches):
|
||||
on_match = self._callbacks.get(ent_id)
|
||||
on_match = self._callbacks.get(self.vocab.strings[ent_id])
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matches)
|
||||
return matches
|
||||
|
|
|
@ -29,7 +29,7 @@ from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
|
|||
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
|
||||
from .._ml import build_bow_text_classifier, build_nel_encoder
|
||||
from .._ml import link_vectors_to_models, zero_init, flatten
|
||||
from .._ml import masked_language_model, create_default_optimizer
|
||||
from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
|
||||
from ..errors import Errors, TempErrors, user_warning, Warnings
|
||||
from .. import util
|
||||
|
||||
|
@ -880,8 +880,7 @@ class ClozeMultitask(Pipe):
|
|||
# and look them up all at once. This prevents data copying.
|
||||
ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = vectors[ids]
|
||||
gradient = (prediction - target) / prediction.shape[0]
|
||||
loss = (gradient**2).sum()
|
||||
loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
|
||||
return float(loss), gradient
|
||||
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
import pytest
|
||||
import re
|
||||
from mock import Mock
|
||||
from spacy.matcher import Matcher, DependencyMatcher
|
||||
from spacy.tokens import Doc, Token
|
||||
|
||||
|
@ -418,3 +419,13 @@ def test_matcher_valid_callback(en_vocab):
|
|||
with pytest.raises(ValueError):
|
||||
matcher.add("TEST", [], [{"TEXT": "test"}])
|
||||
matcher(Doc(en_vocab, words=["test"]))
|
||||
|
||||
|
||||
def test_matcher_callback(en_vocab):
|
||||
mock = Mock()
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"ORTH": "test"}]
|
||||
matcher.add("Rule", mock, pattern)
|
||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||
matches = matcher(doc)
|
||||
mock.assert_called_once_with(matcher, doc, 0, matches)
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from mock import Mock
|
||||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc
|
||||
from ..util import get_doc
|
||||
|
@ -215,3 +216,13 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
matcher.add("TEST3", None, doc3)
|
||||
matcher = PhraseMatcher(en_vocab, attr="TEXT")
|
||||
matcher.add("TEST3", None, doc3)
|
||||
|
||||
|
||||
def test_phrase_matcher_callback(en_vocab):
|
||||
mock = Mock()
|
||||
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
||||
pattern = Doc(en_vocab, words=["Google", "Now"])
|
||||
matcher = PhraseMatcher(en_vocab)
|
||||
matcher.add("COMPANY", mock, pattern)
|
||||
matches = matcher(doc)
|
||||
mock.assert_called_once_with(matcher, doc, 0, matches)
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
import spacy
|
||||
from spacy.pipeline import Sentencizer
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
@ -85,3 +86,26 @@ def test_sentencizer_serialize_bytes(en_vocab):
|
|||
bytes_data = sentencizer.to_bytes()
|
||||
new_sentencizer = Sentencizer().from_bytes(bytes_data)
|
||||
assert new_sentencizer.punct_chars == set(punct_chars)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
# fmt: off
|
||||
"lang,text",
|
||||
[
|
||||
('bn', 'বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।'),
|
||||
('de', 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.'),
|
||||
('hi', 'हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]'),
|
||||
('kn', 'ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]'),
|
||||
('si', 'ශ්රී ලංකාවේ ප්රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්රී ලංකාවේ නිල භාෂාවයි .'),
|
||||
('ta', 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]'),
|
||||
('te', 'ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.'),
|
||||
('ur', 'اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔'),
|
||||
],
|
||||
# fmt: on
|
||||
)
|
||||
def test_sentencizer_across_scripts(lang, text):
|
||||
nlp = spacy.blank(lang)
|
||||
sentencizer = Sentencizer()
|
||||
nlp.add_pipe(sentencizer)
|
||||
doc = nlp(text)
|
||||
assert len(list(doc.sents)) > 1
|
||||
|
|
|
@ -1,11 +1,16 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.language import Language
|
||||
from spacy.tokens import Doc
|
||||
from spacy.compat import is_python2
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.language import Language
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from .util import add_vecs_to_vocab, assert_docs_equal
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -58,3 +63,74 @@ def test_language_evaluate(nlp):
|
|||
# Evaluate badly
|
||||
with pytest.raises(Exception):
|
||||
nlp.evaluate([text, gold])
|
||||
|
||||
|
||||
def vector_modification_pipe(doc):
|
||||
doc.vector += 1
|
||||
return doc
|
||||
|
||||
|
||||
def userdata_pipe(doc):
|
||||
doc.user_data["foo"] = "bar"
|
||||
return doc
|
||||
|
||||
|
||||
def ner_pipe(doc):
|
||||
span = Span(doc, 0, 1, label="FIRST")
|
||||
doc.ents += (span,)
|
||||
return doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_vectors():
|
||||
return [
|
||||
("spacy", [-0.1, -0.2, -0.3]),
|
||||
("world", [-0.2, -0.3, -0.4]),
|
||||
("pipe", [0.7, 0.8, 0.9]),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def nlp2(nlp, sample_vectors):
|
||||
add_vecs_to_vocab(nlp.vocab, sample_vectors)
|
||||
nlp.add_pipe(vector_modification_pipe)
|
||||
nlp.add_pipe(ner_pipe)
|
||||
nlp.add_pipe(userdata_pipe)
|
||||
return nlp
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def texts():
|
||||
data = [
|
||||
"Hello world.",
|
||||
"This is spacy.",
|
||||
"You can use multiprocessing with pipe method.",
|
||||
"Please try!",
|
||||
]
|
||||
return data
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe(nlp2, n_process, texts):
|
||||
texts = texts * 10
|
||||
expecteds = [nlp2(text) for text in texts]
|
||||
docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)
|
||||
|
||||
for doc, expected_doc in zip(docs, expecteds):
|
||||
assert_docs_equal(doc, expected_doc)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
is_python2, reason="python2 seems to be unable to handle iterator properly"
|
||||
)
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe_stream(nlp2, n_process, texts):
|
||||
# check if nlp.pipe can handle infinite length iterator properly.
|
||||
stream_texts = itertools.cycle(texts)
|
||||
texts0, texts1 = itertools.tee(stream_texts)
|
||||
expecteds = (nlp2(text) for text in texts0)
|
||||
docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)
|
||||
|
||||
n_fetch = 20
|
||||
for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
|
||||
assert_docs_equal(doc, expected_doc)
|
||||
|
|
19
spacy/tests/test_register_architecture.py
Normal file
19
spacy/tests/test_register_architecture.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy import register_architecture
|
||||
from spacy import get_architecture
|
||||
from thinc.v2v import Affine
|
||||
|
||||
|
||||
@register_architecture("my_test_function")
|
||||
def create_model(nr_in, nr_out):
|
||||
return Affine(nr_in, nr_out)
|
||||
|
||||
|
||||
def test_get_architecture():
|
||||
arch = get_architecture("my_test_function")
|
||||
assert arch is create_model
|
||||
with pytest.raises(KeyError):
|
||||
get_architecture("not_an_existing_key")
|
|
@ -2,7 +2,6 @@
|
|||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import os
|
||||
import pkg_resources
|
||||
import importlib
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
@ -28,15 +27,21 @@ except ImportError:
|
|||
|
||||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
||||
from .compat import import_file
|
||||
from .compat import import_file, importlib_metadata
|
||||
from .errors import Errors, Warnings, deprecation_warning
|
||||
|
||||
|
||||
LANGUAGES = {}
|
||||
ARCHITECTURES = {}
|
||||
_data_path = Path(__file__).parent / "data"
|
||||
_PRINT_ENV = False
|
||||
|
||||
|
||||
# NB: Ony ever call this once! If called more than ince within the
|
||||
# function, test_issue1506 hangs and it's not 100% clear why.
|
||||
AVAILABLE_ENTRY_POINTS = importlib_metadata.entry_points()
|
||||
|
||||
|
||||
class ENTRY_POINTS(object):
|
||||
"""Available entry points to register extensions."""
|
||||
|
||||
|
@ -44,6 +49,7 @@ class ENTRY_POINTS(object):
|
|||
languages = "spacy_languages"
|
||||
displacy_colors = "spacy_displacy_colors"
|
||||
lookups = "spacy_lookups"
|
||||
architectures = "spacy_architectures"
|
||||
|
||||
|
||||
def set_env_log(value):
|
||||
|
@ -115,6 +121,44 @@ def set_data_path(path):
|
|||
_data_path = ensure_path(path)
|
||||
|
||||
|
||||
def register_architecture(name, arch=None):
|
||||
"""Decorator to register an architecture. An architecture is a function
|
||||
that returns a Thinc Model object.
|
||||
|
||||
name (unicode): The name of the architecture to register.
|
||||
arch (Model): Optional architecture if function is called directly and
|
||||
not used as a decorator.
|
||||
RETURNS (callable): Function to register architecture.
|
||||
"""
|
||||
global ARCHITECTURES
|
||||
if arch is not None:
|
||||
ARCHITECTURES[name] = arch
|
||||
return arch
|
||||
|
||||
def do_registration(arch):
|
||||
ARCHITECTURES[name] = arch
|
||||
return arch
|
||||
|
||||
return do_registration
|
||||
|
||||
|
||||
def get_architecture(name):
|
||||
"""Get a model architecture function by name. Raises a KeyError if the
|
||||
architecture is not found.
|
||||
|
||||
name (unicode): The mame of the architecture.
|
||||
RETURNS (Model): The architecture.
|
||||
"""
|
||||
# Check if an entry point is exposed for the architecture code
|
||||
entry_point = get_entry_point(ENTRY_POINTS.architectures, name)
|
||||
if entry_point is not None:
|
||||
ARCHITECTURES[name] = entry_point
|
||||
if name not in ARCHITECTURES:
|
||||
names = ", ".join(sorted(ARCHITECTURES.keys()))
|
||||
raise KeyError(Errors.E174.format(name=name, names=names))
|
||||
return ARCHITECTURES[name]
|
||||
|
||||
|
||||
def ensure_path(path):
|
||||
"""Ensure string is converted to a Path.
|
||||
|
||||
|
@ -253,6 +297,8 @@ def is_package(name):
|
|||
name (unicode): Name of package.
|
||||
RETURNS (bool): True if installed package, False if not.
|
||||
"""
|
||||
import pkg_resources
|
||||
|
||||
name = name.lower() # compare package name against lowercase name
|
||||
packages = pkg_resources.working_set.by_key.keys()
|
||||
for package in packages:
|
||||
|
@ -282,7 +328,7 @@ def get_entry_points(key):
|
|||
RETURNS (dict): Entry points, keyed by name.
|
||||
"""
|
||||
result = {}
|
||||
for entry_point in pkg_resources.iter_entry_points(key):
|
||||
for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
|
||||
result[entry_point.name] = entry_point.load()
|
||||
return result
|
||||
|
||||
|
@ -296,7 +342,7 @@ def get_entry_point(key, value, default=None):
|
|||
default: Optional default value to return.
|
||||
RETURNS: The loaded entry point or None.
|
||||
"""
|
||||
for entry_point in pkg_resources.iter_entry_points(key):
|
||||
for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
|
||||
if entry_point.name == value:
|
||||
return entry_point.load()
|
||||
return default
|
||||
|
|
|
@ -337,7 +337,7 @@ cdef class Vectors:
|
|||
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
|
||||
|
||||
if sort:
|
||||
sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores, axis=1)[:,::-1]
|
||||
sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
|
||||
scores[i:i+batch_size] = scores[sorted_index]
|
||||
best_rows[i:i+batch_size] = best_rows[sorted_index]
|
||||
|
||||
|
|
|
@ -8,10 +8,10 @@
|
|||
"en_core_web_md",
|
||||
"en_core_web_lg",
|
||||
"en_vectors_web_lg",
|
||||
"en_pytt_bertbaseuncased_lg",
|
||||
"en_pytt_robertabase_lg",
|
||||
"en_pytt_distilbertbaseuncased_lg",
|
||||
"en_pytt_xlnetbasecased_lg"
|
||||
"en_trf_bertbaseuncased_lg",
|
||||
"en_trf_robertabase_lg",
|
||||
"en_trf_distilbertbaseuncased_lg",
|
||||
"en_trf_xlnetbasecased_lg"
|
||||
],
|
||||
"example": "This is a sentence.",
|
||||
"has_examples": true
|
||||
|
@ -19,7 +19,7 @@
|
|||
{
|
||||
"code": "de",
|
||||
"name": "German",
|
||||
"models": ["de_core_news_sm", "de_core_news_md", "de_pytt_bertbasecased_lg"],
|
||||
"models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
|
||||
"example": "Dies ist ein Satz.",
|
||||
"has_examples": true
|
||||
},
|
||||
|
|
|
@ -1675,21 +1675,21 @@
|
|||
}
|
||||
},
|
||||
{
|
||||
"id": "spacy-pytorch-transformers",
|
||||
"title": "spacy-pytorch-transformers",
|
||||
"id": "spacy-transformers",
|
||||
"title": "spacy-transformers",
|
||||
"slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
|
||||
"description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
|
||||
"github": "explosion/spacy-pytorch-transformers",
|
||||
"url": "https://explosion.ai/blog/spacy-pytorch-transformers",
|
||||
"pip": "spacy-pytorch-transformers",
|
||||
"description": "This package provides spaCy model pipelines that wrap [Hugging Face's `transformers`](https://github.com/huggingface/transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
|
||||
"github": "explosion/spacy-transformers",
|
||||
"url": "https://explosion.ai/blog/spacy-transformers",
|
||||
"pip": "spacy-transformers",
|
||||
"category": ["pipeline", "models", "research"],
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"",
|
||||
"nlp = spacy.load(\"en_pytt_bertbaseuncased_lg\")",
|
||||
"nlp = spacy.load(\"en_trf_bertbaseuncased_lg\")",
|
||||
"doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")",
|
||||
"print(doc[0].similarity(doc[7]))",
|
||||
"print(doc._.pytt_last_hidden_state.shape)"
|
||||
"print(doc._.trf_last_hidden_state.shape)"
|
||||
],
|
||||
"author": "Explosion",
|
||||
"author_links": {
|
||||
|
|
|
@ -23,6 +23,7 @@ const MODEL_META = {
|
|||
dep: 'Vocabulary, syntax',
|
||||
ent: 'Named entities',
|
||||
pytt: 'PyTorch Transformers',
|
||||
trf: 'Transformers',
|
||||
vectors: 'Word vectors',
|
||||
web: 'written text (blogs, news, comments)',
|
||||
news: 'written text (news, media)',
|
||||
|
|
Loading…
Reference in New Issue
Block a user