Merge branch 'develop' into nightly.spacy.io
Commit: bf6992c2dd
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0rc0,<8.1.0",
+    "thinc>=8.0.0rc2,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy"
 ]

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0rc0,<8.1.0
+thinc>=8.0.0rc2,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0rc0,<8.1.0
+    thinc>=8.0.0rc2,<8.1.0
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0rc0,<8.1.0
+    thinc>=8.0.0rc2,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0

@@ -7,7 +7,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu  # noqa: F401
+from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
 from thinc.api import Config

 from . import pipeline  # noqa: F401
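
The `require_cpu` helper added to this import line becomes part of the top-level API next to `prefer_gpu` and `require_gpu`. A minimal usage sketch (assuming a locally installed `en_core_web_sm` package; this mirrors the documentation example added further down in this commit):

```python
import spacy

# Switch Thinc's current ops back to NumPy before loading any pipelines.
spacy.require_cpu()
nlp = spacy.load("en_core_web_sm")
```
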
@@ -17,7 +17,9 @@ tolerance = 0.2
 get_length = null

 [pretraining.objective]
-type = "characters"
+@architectures = "spacy.PretrainCharacters.v1"
+maxout_pieces = 3
+hidden_size = 300
 n_characters = 4

 [pretraining.optimizer]
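
With this change, the default pretraining objective is picked through a registered architecture instead of a bare `type` string. A hedged sketch of the two objective blocks as they could appear in a user config, parsed here with Thinc's `Config` (the concrete values are illustrative; the vectors variant takes a `loss` setting rather than `n_characters`):

```python
from thinc.api import Config

# Character-prediction objective, as in the default config above.
chars_cfg = Config().from_str("""
[pretraining.objective]
@architectures = "spacy.PretrainCharacters.v1"
maxout_pieces = 3
hidden_size = 300
n_characters = 4
""")

# Vector-prediction objective: same shape, configured with a loss instead.
vectors_cfg = Config().from_str("""
[pretraining.objective]
@architectures = "spacy.PretrainVectors.v1"
maxout_pieces = 3
hidden_size = 300
loss = "cosine"
""")
```
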
@@ -484,8 +484,8 @@ class Errors:
             "has been applied.")
     E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
             "dimension refers to the width of the vectors table.")
-    E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
-    E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
+    E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
+            "are: {supported}")
     E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
     E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
     E910 = ("Encountered NaN value when computing loss for component '{name}'.")
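
The reworked E906 reports both the offending value and the supported options, and E907 is dropped because the objective is no longer selected by a free-form type string. A small sketch of how the message is formatted, matching the `raise` in the new `PretrainVectors` factory later in this commit:

```python
from spacy.errors import Errors

# "Unexpected `loss` value in pretraining objective: 'l3'. Supported values are: 'cosine', 'L2'"
message = Errors.E906.format(found="l3", supported="'cosine', 'L2'")
print(message)
```
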
@@ -1,4 +1,5 @@
 from .entity_linker import *  # noqa
+from .multi_task import *  # noqa
 from .parser import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa

@@ -1,7 +1,14 @@
-from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
-import numpy
+from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
+from thinc.api import to_categorical, CosineDistance, L2Distance
+
+from ...util import registry
+from ...errors import Errors
+from ...attrs import ID
+
+import numpy
+from functools import partial

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports

@@ -9,6 +16,74 @@ if TYPE_CHECKING:
     from ...tokens import Doc  # noqa: F401


+@registry.architectures.register("spacy.PretrainVectors.v1")
+def create_pretrain_vectors(
+    maxout_pieces: int, hidden_size: int, loss: str
+) -> Callable[["Vocab", Model], Model]:
+    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        model = build_cloze_multi_task_model(
+            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
+        )
+        model.attrs["loss"] = create_vectors_loss()
+        return model
+
+    def create_vectors_loss() -> Callable:
+        if loss == "cosine":
+            distance = CosineDistance(normalize=True, ignore_zeros=True)
+            return partial(get_vectors_loss, distance=distance)
+        elif loss == "L2":
+            distance = L2Distance(normalize=True)
+            return partial(get_vectors_loss, distance=distance)
+        else:
+            raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))
+
+    return create_vectors_objective
+
+
+@registry.architectures.register("spacy.PretrainCharacters.v1")
+def create_pretrain_characters(
+    maxout_pieces: int, hidden_size: int, n_characters: int
+) -> Callable[["Vocab", Model], Model]:
+    def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        model = build_cloze_characters_multi_task_model(
+            vocab,
+            tok2vec,
+            hidden_size=hidden_size,
+            maxout_pieces=maxout_pieces,
+            nr_char=n_characters,
+        )
+        model.attrs["loss"] = partial(get_characters_loss, nr_char=n_characters)
+        return model
+
+    return create_characters_objective
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute a loss based on a distance between the documents' vectors and
+    the prediction.
+    """
+    # The simplest way to implement this would be to vstack the
+    # token.vector values, but that's a bit inefficient, especially on GPU.
+    # Instead we fetch the index into the vectors table for each of our tokens,
+    # and look them up all at once. This prevents data copying.
+    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+    target = docs[0].vocab.vectors.data[ids]
+    d_target, loss = distance(prediction, target)
+    return loss, d_target
+
+
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a loss based on a number of characters predicted from the docs."""
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
+    target = target.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
 def build_multi_task_model(
     tok2vec: Model,
     maxout_pieces: int,

@@ -33,23 +108,19 @@ def build_multi_task_model(


 def build_cloze_multi_task_model(
-    vocab: "Vocab",
-    tok2vec: Model,
-    maxout_pieces: int,
-    hidden_size: int,
-    nO: Optional[int] = None,
+    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    # nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.data.shape[1]
     output_layer = chain(
         list2array(),
         Maxout(
-            nO=nO,
+            nO=hidden_size,
             nI=tok2vec.get_dim("nO"),
             nP=maxout_pieces,
             normalize=True,
             dropout=0.0,
         ),
-        Linear(nO=nO, nI=nO, init_W=zero_init),
+        Linear(nO=nO, nI=hidden_size, init_W=zero_init),
     )
     model = chain(tok2vec, output_layer)
     model = build_masked_language_model(vocab, model)
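
The factories registered above return a closure; the pretraining code calls it with the vocab and the tok2vec layer to pretrain, and the per-batch loss function ends up in `model.attrs["loss"]`. A hedged sketch of that flow (the commented lines stand in for objects that only exist inside a real pretraining run):

```python
from spacy.util import registry

# Look up the character objective registered above and build the factory.
create_objective = registry.architectures.get("spacy.PretrainCharacters.v1")
make_model = create_objective(maxout_pieces=3, hidden_size=300, n_characters=4)

# During pretraining, spaCy calls the factory with the vocab and tok2vec layer:
# model = make_model(nlp.vocab, tok2vec)
# loss_fn = model.attrs["loss"]  # e.g. partial(get_characters_loss, nr_char=4)
```
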
@@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-    # TODO: use a more detailed schema for this?
-    objective: Dict[str, Any] = Field(..., title="Pretraining objective")
+    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
     # fmt: on

     class Config:

@@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
 import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.language import Language, DEFAULT_CONFIG
-from spacy.util import registry, load_model_from_config
+from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
-from spacy.schemas import ConfigSchema
+from spacy.schemas import ConfigSchema, ConfigSchemaPretrain

 from ..util import make_tempdir


 nlp_config_string = """
 [paths]
 train = null

@@ -63,6 +63,59 @@ factory = "tagger"
 width = ${components.tok2vec.model.width}
 """

+pretrain_config_string = """
+[paths]
+train = null
+dev = null
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+
+[training]
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 666
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[pretraining]
+"""
+
+
 parser_config_string = """
 [model]

@@ -126,6 +179,14 @@ def test_create_nlp_from_config():
         load_model_from_config(Config(bad_cfg), auto_fill=True)


+def test_create_nlp_from_pretraining_config():
+    """Test that the default pretraining config validates properly"""
+    config = Config().from_str(pretrain_config_string)
+    pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = config.merge(pretrain_config)
+    resolved = registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
+
+
 def test_create_nlp_from_config_multiple_instances():
     """Test that the nlp object is created correctly for a config with multiple
     instances of the same component."""

@@ -4,7 +4,7 @@ import ctypes
 from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
-from spacy import prefer_gpu, require_gpu
+from spacy import prefer_gpu, require_gpu, require_cpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 from spacy.util import dot_to_object, SimpleFrozenList

@@ -15,6 +15,8 @@ from spacy.lang.nl import Dutch
 from spacy.language import DEFAULT_CONFIG_PATH
 from spacy.schemas import ConfigSchemaTraining

+from thinc.api import get_current_ops, NumpyOps, CupyOps
+
 from .util import get_random_doc


@@ -81,6 +83,8 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
 def test_prefer_gpu():
     try:
         import cupy  # noqa: F401
+        prefer_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
     except ImportError:
         assert not prefer_gpu()

@@ -88,10 +92,24 @@ def test_prefer_gpu():
 def test_require_gpu():
     try:
         import cupy  # noqa: F401
+        require_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
     except ImportError:
         with pytest.raises(ValueError):
             require_gpu()


+def test_require_cpu():
+    require_cpu()
+    assert isinstance(get_current_ops(), NumpyOps)
+    try:
+        import cupy  # noqa: F401
+        require_gpu()
+        assert isinstance(get_current_ops(), CupyOps)
+    except ImportError:
+        pass
+    require_cpu()
+    assert isinstance(get_current_ops(), NumpyOps)
+
+
 def test_ascii_filenames():
     """Test that all filenames in the project are ASCII.

@@ -2,6 +2,7 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
+from spacy.lang.en import English


 def test_tokenizer_handles_no_word(tokenizer):

@@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
     ]


+def test_tokenizer_special_cases_with_affixes_preserve_spacy():
+    tokenizer = English().tokenizer
+    # reset all special cases
+    tokenizer.rules = {}
+
+    # in-place modification (only merges)
+    text = "''a'' "
+    tokenizer.add_special_case("''", [{"ORTH": "''"}])
+    assert tokenizer(text).text == text
+
+    # not in-place (splits and merges)
+    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
+    text = "ab ab ab ''ab ab'' ab'' ''ab"
+    assert tokenizer(text).text == text
+
+
 def test_tokenizer_special_cases_with_period(tokenizer):
     text = "_SPECIAL_."
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])

@@ -514,6 +514,11 @@ def test_roundtrip_docs_to_docbin(doc):
             ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
         ),
         ([" ", "a"], ["a"], ([[], [0]], [[1]])),
+        (
+            ["a", "''", "'", ","],
+            ["a'", "''", ","],
+            ([[0], [0, 1], [1], [2]], [[0, 1], [1, 2], [3]]),
+        ),
     ],
 )
 def test_align(tokens_a, tokens_b, expected):  # noqa

@@ -698,7 +703,7 @@ def test_alignment_spaces(en_vocab):
     align = Alignment.from_strings(other_tokens, spacy_tokens)
     assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
     assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]

     # multiple leading whitespace tokens

@@ -707,7 +712,7 @@ def test_alignment_spaces(en_vocab):
     align = Alignment.from_strings(other_tokens, spacy_tokens)
     assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
     assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]

     # both with leading whitespace, not identical

@@ -338,7 +338,7 @@ cdef class Tokenizer:
         # Copy special case tokens into doc and adjust token and
        # character offsets
         idx_offset = 0
-        orig_final_spacy = doc.c[span_end + offset - 1].spacy
+        orig_final_spacy = doc.c[span_end - 1].spacy
         orig_idx = doc.c[i].idx
         for j in range(cached.length):
             tokens[i + offset + j] = cached.data.tokens[j]

@@ -7,8 +7,8 @@ from ..errors import Errors

 def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
     # Create character-to-token mappings
-    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
-    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
+    char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
+    char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))
     str_a = "".join(A).lower()
     str_b = "".join(B).lower()
     cdef int len_str_a = len(str_a)

@@ -36,8 +36,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
         if prev_token_idx_b != token_idx_b:
             b2a.append(set())
         # Process the alignment at the current position
-        if A[token_idx_a] == B[token_idx_b]:
-            # Current tokens are identical
+        if A[token_idx_a] == B[token_idx_b] and \
+                (char_idx_a == 0 or \
+                char_to_token_a[char_idx_a - 1] < token_idx_a) and \
+                (char_idx_b == 0 or \
+                char_to_token_b[char_idx_b - 1] < token_idx_b):
+            # Current tokens are identical and both character offsets are the
+            # start of a token (either at the beginning of the document or the
+            # previous character belongs to a different token)
             a2b[-1].add(token_idx_b)
             b2a[-1].add(token_idx_a)
             char_idx_a += len(A[token_idx_a])
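
The stricter condition above only treats two tokens as an exact match when both character offsets are at the start of a token, which is what the new parametrized test case exercises. A small sketch reproducing that case with the `Alignment` helper used elsewhere in this commit's tests (the import path is assumed):

```python
from spacy.training import Alignment  # import path assumed

# Expected by the new test case:
#   x2y = [[0], [0, 1], [1], [2]]   y2x = [[0, 1], [1, 2], [3]]
align = Alignment.from_strings(["a", "''", "'", ","], ["a'", "''", ","])
print(list(align.x2y.lengths), list(align.x2y.dataXd))
print(list(align.y2x.lengths), list(align.y2x.dataXd))
```
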
@@ -28,7 +28,7 @@ def train(
     use_gpu: int = -1,
     stdout: IO = sys.stdout,
     stderr: IO = sys.stderr,
-) -> None:
+) -> Tuple["Language", Optional[Path]]:
     """Train a pipeline.

     nlp (Language): The initialized nlp object with the full config.

@@ -40,7 +40,7 @@ def train(
     stderr (file): A second file-like object to write output messages. To disable
         printing, set to io.StringIO.

-    RETURNS (Path / None): The path to the final exported model.
+    RETURNS (tuple): The final nlp object and the path to the exported model.
     """
     # We use no_print here so we can respect the stdout/stderr options.
     msg = Printer(no_print=True)

@@ -105,17 +105,18 @@ def train(
             raise e
     finally:
         finalize_logger()
+        if optimizer.averages:
+            nlp.use_params(optimizer.averages)
         if output_path is not None:
             final_model_path = output_path / DIR_MODEL_LAST
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
+            nlp.to_disk(final_model_path)
             # This will only run if we don't hit an error
             stdout.write(
                 msg.good("Saved pipeline to output directory", final_model_path) + "\n"
             )
+            return (nlp, final_model_path)
+        else:
+            return (nlp, None)


 def train_while_improving(
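
After this change, `train()` returns the final pipeline together with the export path (or `None` when no `output_path` was given) instead of returning nothing. A hedged sketch of the new calling convention (the module path is assumed from context, and `nlp` must already be initialized from a full training config):

```python
from pathlib import Path
from spacy.training.loop import train  # module path assumed

def run_training(nlp, output_dir: Path):
    # train() now hands back a (nlp, final_model_path) tuple instead of None.
    trained_nlp, final_model_path = train(nlp, output_dir)
    return trained_nlp, final_model_path
```
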
@@ -1,22 +1,16 @@
 from typing import Optional, Callable, Iterable, Union, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
-from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
+from thinc.api import set_dropout_rate
 from pathlib import Path
-from functools import partial
 from collections import Counter
 import srsly
-import numpy
 import time
 import re
 from wasabi import Printer

 from .example import Example
 from ..tokens import Doc
-from ..attrs import ID
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..errors import Errors
 from ..util import registry, load_model_from_config, dot_to_object

@@ -49,6 +43,7 @@ def pretrain(
     else:
         # Without '--resume-path' the '--epoch-resume' argument is ignored
         epoch_resume = 0
+    objective = model.attrs["loss"]
     # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
     msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")

@@ -69,7 +64,6 @@ def pretrain(
         with (output_dir / "log.jsonl").open("a") as file_:
             file_.write(srsly.json_dumps(log) + "\n")

-    objective = create_objective(P["objective"])
     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
     for epoch in range(epoch_resume, P["max_epochs"]):

@@ -132,58 +126,6 @@ def make_update(
     return float(loss)


-def create_objective(config: Config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            distance = CosineDistance(normalize=True, ignore_zeros=True)
-            return partial(get_vectors_loss, distance=distance)
-        elif config["loss"] == "L2":
-            distance = L2Distance(normalize=True, ignore_zeros=True)
-            return partial(get_vectors_loss, distance=distance)
-        else:
-            raise ValueError(Errors.E906.format(loss_type=config["loss"]))
-    else:
-        raise ValueError(Errors.E907.format(objective_type=objective_type))
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
 def create_pretraining_model(nlp, pretrain_config):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that

@@ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
     The actual tok2vec layer is stored as a reference, and only this bit will be
     serialized to file and read back in when calling the 'train' command.
     """
+    nlp.initialize()
     component = nlp.get_pipe(pretrain_config["component"])
     if pretrain_config.get("layer"):
         tok2vec = component.model.get_ref(pretrain_config["layer"])
     else:
         tok2vec = component.model

-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
+    create_function = pretrain_config["objective"]
+    model = create_function(nlp.vocab, tok2vec)
     model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
     set_dropout_rate(model, pretrain_config["dropout"])
     return model

@@ -171,6 +171,25 @@ and _before_ loading any pipelines.
 | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ |
 | **RETURNS** | `True` ~~bool~~                                  |

+### spacy.require_cpu {#spacy.require_cpu tag="function" new="3.0.0"}
+
+Allocate data and perform operations on CPU. If data has already been allocated
+on GPU, it will not be moved. Ideally, this function should be called right
+after importing spaCy and _before_ loading any pipelines.
+
+> #### Example
+>
+> ```python
+> import spacy
+> spacy.require_cpu()
+> nlp = spacy.load("en_core_web_sm")
+> ```
+
+| Name        | Description     |
+| ----------- | --------------- |
+| **RETURNS** | `True` ~~bool~~ |
+
 ## displaCy {#displacy source="spacy/displacy"}

 As of v2.0, spaCy comes with a built-in visualization suite. For more info and

@@ -158,29 +158,37 @@ The other way to install spaCy is to clone its
 source. That is the common way if you want to make changes to the code base.
 You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
-[pip](https://pip.pypa.io/en/latest/installing/),
-[virtualenv](https://virtualenv.pypa.io/) and [git](https://git-scm.com)
-installed. The compiler part is the trickiest. How to do that depends on your
-system. See notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
+[pip](https://pip.pypa.io/en/stable/) and [git](https://git-scm.com) installed.
+The compiler part is the trickiest. How to do that depends on your system. See
+notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
 [Windows](#source-windows) for details.

 ```bash
-$ python -m pip install -U pip                  # update pip
+$ python -m pip install -U pip setuptools wheel # install/update build tools
 $ git clone https://github.com/explosion/spaCy  # clone spaCy
 $ cd spaCy                                      # navigate into dir

 $ python -m venv .env                           # create environment in .env
 $ source .env/bin/activate                      # activate virtual env
-$ export PYTHONPATH=`pwd`                       # set Python path to spaCy dir
-$ pip install -r requirements.txt               # install all requirements
-$ python setup.py build_ext --inplace           # compile spaCy
-$ python setup.py install                       # install spaCy
+$ pip install .                                 # compile and install spaCy
 ```

-Compared to regular install via pip, the
-[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs
-developer dependencies such as Cython. See the [quickstart widget](#quickstart)
-to get the right commands for your platform and Python version.
+To install with extras:
+
+```bash
+$ pip install .[lookups,cuda102] # install spaCy with extras
+```
+
+To install all dependencies required for development:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Compared to a regular install via pip, the
+[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally includes
+developer dependencies such as Cython and the libraries required to run the test
+suite. See the [quickstart widget](#quickstart) to get the right commands for
+your platform and Python version.

 <a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a>

@@ -195,6 +203,32 @@ to get the right commands for your platform and Python version.
 [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
 that matches the version that was used to compile your Python interpreter.

+#### Additional options for developers {#source-developers}
+
+Some additional options may be useful for spaCy developers who are editing the
+source code and recompiling frequently.
+
+- Install in editable mode. Changes to `.py` files will be reflected as soon as
+  the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require
+  the `pip install` or `python setup.py build_ext` command below to be run
+  again. Before installing in editable mode, be sure you have removed any
+  previous installs with `pip uninstall spacy`, which you may need to run
+  multiple times to remove all traces of earlier installs.
+
+  ```bash
+  $ pip install -r requirements.txt
+  $ pip install --no-build-isolation --editable .
+  ```
+
+- Build in parallel using `N` CPUs to speed up compilation and then install in
+  editable mode:
+
+  ```bash
+  $ pip install -r requirements.txt
+  $ python setup.py build_ext --inplace -j N
+  $ pip install --no-build-isolation --editable .
+  ```
+
 ### Building an executable {#executable}

 The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that