From 60e8da481300da3540138d2689f73324a07b071b Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 20 May 2020 12:56:27 +0200 Subject: [PATCH 01/39] Tidy up train-from-config a bit --- spacy/cli/train_from_config.py | 38 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 429a3cf49..c75c861cc 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -193,10 +193,11 @@ def train_from_config( optimizer, train_batches, evaluate, - training["dropout"], - training["patience"], - training["eval_frequency"], - training["accumulate_gradient"] + dropout=training["dropout"], + accumulate_gradient=training["accumulate_gradient"], + patience=training.get("patience", 0), + max_steps=training.get("max_steps", 0), + eval_frequency=training["eval_frequency"], ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") @@ -214,17 +215,17 @@ def train_from_config( progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) finally: if output_path is not None: - with nlp.use_params(optimizer.averages): - final_model_path = output_path / "model-final" + final_model_path = output_path / "model-final" + if optimizer.averages: + with nlp.use_params(optimizer.averages): + nlp.to_disk(final_model_path) + else: nlp.to_disk(final_model_path) msg.good("Saved model to output directory", final_model_path) - # with msg.loading("Creating best model..."): - # best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) - # msg.good("Created best model", best_model_path) def create_train_batches(nlp, corpus, cfg): - is_first = True + epochs_todo = cfg.get("max_epochs", 0) while True: train_examples = list(corpus.train_dataset( nlp, @@ -240,6 +241,11 @@ def create_train_batches(nlp, corpus, cfg): batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"]) for batch in batches: yield batch + epochs_todo -= 1 + # We intentionally compare exactly to 0 here, so that max_epochs < 1 + # will not break. + if epochs_todo == 0: + break def create_evaluation_callback(nlp, optimizer, corpus, cfg): @@ -270,8 +276,8 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): def train_while_improving( - nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency, - accumulate_gradient + nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency, + accumulate_gradient=1, patience=0, max_steps=0 ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -281,6 +287,7 @@ def train_while_improving( Positional arguments: nlp: The spaCy pipeline to evaluate. + optimizer: The optimizer callable. train_data (Iterable[Batch]): A generator of batches, with the training data. Each batch should be a Sized[Tuple[Input, Annot]]. 
The training data iterable needs to take care of iterating over the epochs and @@ -344,9 +351,12 @@ def train_while_improving( yield batch, info, is_best_checkpoint if is_best_checkpoint is not None: losses = {} - # Stop if no improvement in `patience` updates + # Stop if no improvement in `patience` updates (if specified) best_score, best_step = max(results) - if (step - best_step) >= patience: + if patience and (step - best_step) >= patience: + break + # Stop if we've exhausted our max steps (if specified) + if max_steps and (step * accumulate_gradient) >= max_steps: break From f075655debdd35e2cd648bd845b8b966edb5c733 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 19:26:29 +0200 Subject: [PATCH 02/39] Fix shape inference in begin_training --- spacy/ml/models/parser.py | 3 +-- spacy/ml/models/tagger.py | 3 +-- spacy/ml/tb_framework.py | 4 ++-- spacy/pipeline/pipes.pyx | 7 ++++++- spacy/syntax/nn_parser.pyx | 4 ++++ 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 710d36a1d..0e0857ca8 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -15,10 +15,9 @@ def build_tb_parser_model( use_upper=True, nO=None, ): - token_vector_width = tok2vec.get_dim("nO") tok2vec = chain( tok2vec, - with_array(Linear(hidden_width, token_vector_width)), + with_array(Linear(hidden_width)), list2array(), ) tok2vec.set_dim("nO", hidden_width) diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 683c8b518..87256cb5c 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -6,9 +6,8 @@ from ...util import registry @registry.architectures.register("spacy.Tagger.v1") def build_tagger_model(tok2vec, nO=None) -> Model: - token_vector_width = tok2vec.get_dim("nO") # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! - output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init) + output_layer = Softmax(nO, init_W=zero_init) softmax = with_array(output_layer) model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index e4301a644..251189389 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -38,8 +38,8 @@ def forward(model, X, is_train): def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize() - lower = model.get_ref("lower").initialize(X=X) + tok2vec = model.get_ref("tok2vec").initialize(X=X) + lower = model.get_ref("lower").initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) model.get_ref("upper").initialize(X=statevecs) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 56fe54664..00c8894fd 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -531,7 +531,12 @@ class Tagger(Pipe): vocab.morphology.lemmatizer, exc=vocab.morphology.exc) self.set_output(len(self.labels)) - self.model.initialize() + doc_sample = [Doc(self.vocab, words=["hello", "world"])] + for name, component in pipeline: + if component is self: + break + doc_sample = list(component.pipe(doc_sample)) + self.model.initialize(X=doc_sample) # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. 
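        # A rough, illustrative sketch of the convention relied on here (not
        # the exact Tagger/parser code): a Thinc model with unset dimensions
        # can be initialized from sample data, after which the dims are set:
        #
        #     docs = [Doc(vocab, words=["hello", "world"])]
        #     model.initialize(X=docs)                 # infers missing widths
        #     width = model.get_ref("tok2vec").get_dim("nO")   # now available
        #
        # This is what lets the hard-coded tok2vec.get_dim("nO") calls in the
        # model builders be dropped (or, in the later patches in this series,
        # made conditional on has_dim("nO")).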
link_vectors_to_models(self.vocab) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 31aa4d413..94369a828 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -629,6 +629,10 @@ cdef class Parser: for doc, gold in parses: doc_sample.append(doc) gold_sample.append(gold) + for name, component in pipeline: + if component is self: + break + doc_sample = list(component.pipe(doc_sample)) self.model.initialize(doc_sample, gold_sample) if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) From 3b5cfec1fcf34e45d86fd2b133120be13141488a Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 19:32:04 +0200 Subject: [PATCH 03/39] Tweak memory management in train_from_config --- spacy/cli/train_from_config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index c75c861cc..eeb21c10c 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -213,6 +213,12 @@ def train_from_config( if is_best_checkpoint and output_path is not None: nlp.to_disk(output_path) progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) + # Clean up the objects to faciliate garbage collection. + for eg in batch: + eg.doc = None + eg.goldparse = None + eg.doc_annotation = None + eg.token_annotation = None finally: if output_path is not None: final_model_path = output_path / "model-final" From df87c32a4068484471f5ce53b1f4eb7e4f9e4c43 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 20:17:24 +0200 Subject: [PATCH 04/39] Pass smaller doc sample into model initialize --- spacy/syntax/nn_parser.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 94369a828..ed4697302 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -624,11 +624,12 @@ cdef class Parser: sgd = self.create_optimizer() doc_sample = [] gold_sample = [] - for example in islice(get_examples(), 1000): + for example in islice(get_examples(), 10): parses = example.get_gold_parses(merge=False, vocab=self.vocab) for doc, gold in parses: - doc_sample.append(doc) - gold_sample.append(gold) + if len(doc): + doc_sample.append(doc) + gold_sample.append(gold) for name, component in pipeline: if component is self: break From d507ac28d8db197e8eac6b8c420ef3502af0a006 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 20:46:10 +0200 Subject: [PATCH 05/39] Fix shape inference --- spacy/ml/models/parser.py | 3 ++- spacy/ml/models/tagger.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 0e0857ca8..bdcd709b1 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -15,9 +15,10 @@ def build_tb_parser_model( use_upper=True, nO=None, ): + t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain( tok2vec, - with_array(Linear(hidden_width)), + with_array(Linear(hidden_width, t2v_width)), list2array(), ) tok2vec.set_dim("nO", hidden_width) diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 87256cb5c..00e268ede 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -7,7 +7,8 @@ from ...util import registry @registry.architectures.register("spacy.Tagger.v1") def build_tagger_model(tok2vec, nO=None) -> Model: # TODO: glorot_uniform_init seems to work a bit better than 
zero_init here?! - output_layer = Softmax(nO, init_W=zero_init) + t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None + output_layer = Softmax(nO, t2v_width, init_W=zero_init) softmax = with_array(output_layer) model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) From bc94fdabd0ec7362a68f38aa8cbb0b80f818f243 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 20:46:21 +0200 Subject: [PATCH 06/39] Fix begin_training --- spacy/pipeline/pipes.pyx | 12 ++++++++---- spacy/syntax/nn_parser.pyx | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 00c8894fd..f75ed1659 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -532,10 +532,14 @@ class Tagger(Pipe): exc=vocab.morphology.exc) self.set_output(len(self.labels)) doc_sample = [Doc(self.vocab, words=["hello", "world"])] - for name, component in pipeline: - if component is self: - break - doc_sample = list(component.pipe(doc_sample)) + if pipeline is not None: + for name, component in pipeline: + if component is self: + break + if hasattr(component, "pipe"): + doc_sample = list(component.pipe(doc_sample)) + else: + doc_sample = [component(doc) for doc in doc_sample] self.model.initialize(X=doc_sample) # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ed4697302..f8e819268 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -630,11 +630,19 @@ cdef class Parser: if len(doc): doc_sample.append(doc) gold_sample.append(gold) - for name, component in pipeline: - if component is self: - break - doc_sample = list(component.pipe(doc_sample)) - self.model.initialize(doc_sample, gold_sample) + + if pipeline is not None: + for name, component in pipeline: + if component is self: + break + if hasattr(component, "pipe"): + doc_sample = list(component.pipe(doc_sample)) + else: + doc_sample = [component(doc) for doc in doc_sample] + if doc_sample: + self.model.initialize(doc_sample) + else: + self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) link_vectors_to_models(self.vocab) From 25b51f4fc8a102fd1c83d62d078f071823f222eb Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 20:47:52 +0200 Subject: [PATCH 07/39] Set version to v3.0.0.dev9 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 3af1b77a0..04a660ad1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev8" +__version__ = "3.0.0.dev9" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 6e6db6afb62a0377bcd5f0c64220ad05f512c073 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 15:42:46 +0200 Subject: [PATCH 08/39] Better model compatibility and validation --- requirements.txt | 1 + setup.cfg | 1 + spacy/cli/info.py | 4 ++- spacy/cli/package.py | 5 ++-- spacy/cli/train.py | 2 +- spacy/cli/validate.py | 39 +++++++++++++++++---------- spacy/language.py | 2 +- spacy/util.py | 61 +++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 96 insertions(+), 19 deletions(-) diff --git a/requirements.txt 
b/requirements.txt index e5f1ae10b..c43ffa7bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 +importlib_metadata>=0.20; python_version < "3.8" # Optional dependencies jsonschema>=2.6.0,<3.1.0 pydantic>=1.3.0,<2.0.0 diff --git a/setup.cfg b/setup.cfg index 1cd088279..eb7608c4e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,6 +56,7 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 tqdm>=4.38.0,<5.0.0 + importlib_metadata>=0.20; python_version < "3.8" [options.extras_require] lookups = diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 23f766368..d779eb2b3 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -48,7 +48,9 @@ def info( "Location": str(Path(__file__).parent.parent), "Platform": platform.platform(), "Python version": platform.python_version(), - "Models": ", ".join(model["name"] for model in all_models.values()), + "Models": ", ".join( + f"{m['name']} ({m['version']})" for m in all_models.values() + ), } if not silent: title = "Info about spaCy" diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8e27e44d0..cf93c872f 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg): ("lang", "Model language", meta.get("lang", "en")), ("name", "Model name", meta.get("name", "model")), ("version", "Model version", meta.get("version", "0.0.0")), - ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"), ("description", "Model description", meta.get("description", False)), ("author", "Author", meta.get("author", False)), ("email", "Author email", meta.get("email", False)), ("url", "Author website", meta.get("url", False)), - ("license", "License", meta.get("license", "CC BY-SA 3.0")), + ("license", "License", meta.get("license", "MIT")), ] nlp = util.load_model_from_path(Path(model_path)) + meta["spacy_version"] = about.__version__ meta["pipeline"] = nlp.pipe_names meta["vectors"] = { "width": nlp.vocab.vectors_length, @@ -168,6 +168,7 @@ def setup_package(): package_data={model_name: list_files(model_dir)}, install_requires=list_requirements(meta), zip_safe=False, + entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]} ) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 19e0a81e0..c205fa5b2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -467,7 +467,7 @@ def train( # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names - meta["spacy_version"] = f">={about.__version__}" + meta["spacy_version"] = about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index a23ce3453..c39cadc7b 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -4,6 +4,8 @@ import requests from wasabi import msg from .. 
import about +from ..util import get_package_version, get_installed_models, split_version +from ..util import get_package_path, get_model_meta, is_compatible_model def validate(): @@ -25,7 +27,7 @@ def validate(): msg.info(f"spaCy installation: {spacy_dir}") if model_pkgs: - header = ("NAME", "VERSION", "") + header = ("NAME", "SPACY", "VERSION", "") rows = [] for name, data in model_pkgs.items(): if data["compat"]: @@ -34,7 +36,7 @@ def validate(): else: version = msg.text(data["version"], color="red", no_print=True) comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" - rows.append((data["name"], version, comp)) + rows.append((data["name"], data["spacy"], version, comp)) msg.table(rows, header=header) else: msg.text("No models found in your current environment.", exits=0) @@ -44,8 +46,9 @@ def validate(): cmd = "python -m spacy download {}" print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") if na_models: - msg.warn( - f"The following models are not available for spaCy v{about.__version__}:", + msg.info( + f"The following models are custom spaCy models or not " + f"available for spaCy v{about.__version__}:", ", ".join(na_models), ) if incompat_models: @@ -53,8 +56,6 @@ def validate(): def get_model_pkgs(): - import pkg_resources - with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: @@ -66,20 +67,30 @@ def get_model_pkgs(): msg.good("Loaded compatibility table") compat = r.json()["spacy"] all_models = set() + installed_models = get_installed_models() for spacy_v, models in dict(compat).items(): all_models.update(models.keys()) for model, model_vs in models.items(): compat[spacy_v][model] = [reformat_version(v) for v in model_vs] pkgs = {} - for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): + for pkg_name in installed_models: package = pkg_name.replace("-", "_") - if package in all_models: - version = pkg_data.version - pkgs[pkg_name] = { - "name": package, - "version": version, - "compat": package in compat and version in compat[package], - } + version = get_package_version(pkg_name) + if package in compat: + is_compat = version in compat[package] + v_maj, v_min = split_version(about.__version__) + spacy_version = f"{v_maj}.{v_min}" + else: + model_path = get_package_path(package) + model_meta = get_model_meta(model_path) + is_compat = is_compatible_model(model_meta) + spacy_version = model_meta.get("spacy_version", "n/a") + pkgs[pkg_name] = { + "name": package, + "version": version, + "spacy": spacy_version, + "compat": is_compat, + } return pkgs, compat diff --git a/spacy/language.py b/spacy/language.py index d71c27406..f770cda2c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -196,7 +196,7 @@ class Language(object): self._meta.setdefault("lang", self.lang) self._meta.setdefault("name", "model") self._meta.setdefault("version", "0.0.0") - self._meta.setdefault("spacy_version", f">={about.__version__}") + self._meta.setdefault("spacy_version", about.__version__) self._meta.setdefault("description", "") self._meta.setdefault("author", "") self._meta.setdefault("email", "") diff --git a/spacy/util.py b/spacy/util.py index 7f35c2f7c..5a7c633fa 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -21,9 +21,16 @@ try: except ImportError: cupy = None +try: # Python 3.8 + import importlib.metadata as importlib_metadata +except ImportError: + import importlib_metadata + from .symbols import ORTH from .compat import cupy, CudaStream from .errors import Errors, Warnings +from . 
import about + _PRINT_ENV = False @@ -35,6 +42,10 @@ class registry(thinc.registry): factories = catalogue.create("spacy", "factories", entry_points=True) displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) assets = catalogue.create("spacy", "assets", entry_points=True) + # This is mostly used to get a list of all installed models in the current + # environment. spaCy models packaged with `spacy package` will "advertise" + # themselves via entry points. + models = catalogue.create("spacy", "models", entry_points=True) def set_env_log(value): @@ -204,6 +215,56 @@ def load_model_from_init_py(init_file, **overrides): return load_model_from_path(data_path, meta, **overrides) +def get_installed_models(): + """List all model packages currently installed in the environment. + + RETURNS (list): The string names of the models. + """ + return list(registry.models.get_all().keys()) + + +def get_package_version(name): + """Get the version of an installed package. Typically used to get model + package versions. + + name (unicode): The name of the installed Python package. + RETURNS (unicode / None): The version or None if package not installed. + """ + try: + return importlib_metadata.version(name) + except importlib_metadata.PackageNotFoundError: + return None + + +def split_version(version): + """RETURNS (tuple): Two integers, the major and minor spaCy version.""" + pieces = version.split(".", 3) + return int(pieces[0]), int(pieces[1]) + + +def is_compatible_model(meta): + """Check if a model is compatible with the current version of spaCy, based + on its meta.json. We compare the version of spaCy the model was created with + with the current version. If the minor version is different, it's considered + incompatible. + + meta (dict): The model's meta. + RETURNS (bool / None): Whether the model is compatible with the current + spaCy or None if we don't have enough info. + """ + cur_v = about.__version__ + pkg_v = meta.get("spacy_version") + if not pkg_v or not isinstance(pkg_v, str): + return None + # Handle spacy_version values like >=x, Date: Fri, 22 May 2020 15:55:45 +0200 Subject: [PATCH 09/39] Guess set_annotations=True in nlp.update During `nlp.update`, components can be passed a boolean set_annotations to indicate whether they should assign annotations to the `Doc`. This needs to be called if downstream components expect to use the annotations during training, e.g. if we wanted to use tagger features in the parser. Components can specify their assignments and requirements, so we can figure out which components have these inter-dependencies. After figuring this out, we can guess whether to pass set_annotations=True. We could also call set_annotations=True always, or even just have this as the only behaviour. The downside of this is that it would require the `Doc` objects to be created afresh to avoid problematic modifications. One approach would be to make a fresh copy of the `Doc` objects within `nlp.update()`, so that we can write to the objects without any problems. If we do that, we can drop this logic and also drop the `set_annotations` mechanism. I would be fine with that approach, although it runs the risk of introducing some performance overhead, and we'll have to take care to copy all extension attributes etc. 
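
To make the reasoning above concrete, here is a condensed, self-contained sketch of the interdependency count this patch introduces, using the two toy components from the new test (`Fancifier` assigns a custom attribute, `FancyNeeder` requires it). The helper below is a compressed rewrite for illustration, not the exact function added in this patch:

    def count_interdependencies(pipeline):
        # For each component, count how many *later* components require an
        # annotation that this component assigns.
        assigns = [set(getattr(p, "assigns", ())) for _, p in pipeline]
        requires = [set(getattr(p, "requires", ())) for _, p in pipeline]
        return [
            sum(1 for req in requires[i + 1:] if ass & req)
            for i, ass in enumerate(assigns)
        ]

    class Fancifier:
        assigns = ("doc._.fancy",)
        requires = ()

    class FancyNeeder:
        assigns = ()
        requires = ("doc._.fancy",)

    pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
    assert count_interdependencies(pipeline) == [1, 0]
    # nlp.update() then passes set_annotations=True to "fancifier" (count > 0)
    # and set_annotations=False to "needer" (count == 0).
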
--- spacy/language.py | 24 +++++++++++++++++++++-- spacy/tests/pipeline/test_pipe_methods.py | 18 ++++++++++++++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index d71c27406..afc988583 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -545,13 +545,14 @@ class Language(object): if component_cfg is None: component_cfg = {} + component_deps = _count_pipeline_inter_dependencies(self.pipeline) # Determine whether component should set annotations. In theory I guess # we should do this by inspecting the meta? Or we could just always # say "yes" - for name, proc in self.pipeline: + for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) component_cfg[name].setdefault("drop", drop) - component_cfg[name].setdefault("set_annotations", False) + component_cfg[name]["set_annotations"] = bool(component_deps[i]) for name, proc in self.pipeline: if not hasattr(proc, "update"): continue @@ -1159,6 +1160,25 @@ class DisabledPipes(list): self[:] = [] +def _count_pipeline_inter_dependencies(pipeline): + """Count how many subsequent components require an annotation set by each + component in the pipeline. + """ + pipe_assigns = [] + pipe_requires = [] + for name, pipe in pipeline: + pipe_assigns.append(set(getattr(pipe, "assigns", []))) + pipe_requires.append(set(getattr(pipe, "requires", []))) + counts = [] + for i, assigns in enumerate(pipe_assigns): + count = 0 + for requires in pipe_requires[i+1:]: + if assigns.intersection(requires): + count += 1 + counts.append(count) + return counts + + def _pipe(examples, proc, kwargs): # We added some args for pipe that __call__ doesn't expect. kwargs = dict(kwargs) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index d42216655..0397d490d 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,5 +1,5 @@ import pytest -from spacy.language import Language +from spacy.language import Language, _count_pipeline_inter_dependencies @pytest.fixture @@ -198,3 +198,19 @@ def test_pipe_labels(nlp): assert len(nlp.pipe_labels) == len(input_labels) for name, labels in nlp.pipe_labels.items(): assert sorted(input_labels[name]) == sorted(labels) + + +def test_pipe_inter_dependencies(): + class Fancifier: + name = "fancifier" + assigns = ("doc._.fancy",) + requires = tuple() + + class FancyNeeder: + name = "needer" + assigns = tuple() + requires = ("doc._.fancy",) + + pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())] + counts = _count_pipeline_inter_dependencies(pipeline) + assert counts == [1, 0] From f7f6df7275ea2884fc47fa7823c6bcba1caa5cb4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 22 May 2020 16:43:18 +0200 Subject: [PATCH 10/39] Move to spacy.analysis --- spacy/analysis.py | 21 +++++++++++++++++++++ spacy/language.py | 22 ++-------------------- spacy/tests/pipeline/test_analysis.py | 17 +++++++++++++++++ spacy/tests/pipeline/test_pipe_methods.py | 18 +----------------- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/spacy/analysis.py b/spacy/analysis.py index c2600048f..41591661c 100644 --- a/spacy/analysis.py +++ b/spacy/analysis.py @@ -173,3 +173,24 @@ def print_summary(nlp, pretty=True, no_print=False): msg.good("No problems found.") if no_print: return {"overview": overview, "problems": problems} + + +def count_pipeline_interdependencies(pipeline): + """Count how many subsequent components require an annotation set by each + 
component in the pipeline. + """ + pipe_assigns = [] + pipe_requires = [] + for name, pipe in pipeline: + pipe_assigns.append(set(getattr(pipe, "assigns", []))) + pipe_requires.append(set(getattr(pipe, "requires", []))) + counts = [] + for i, assigns in enumerate(pipe_assigns): + count = 0 + for requires in pipe_requires[i+1:]: + if assigns.intersection(requires): + count += 1 + counts.append(count) + return counts + + diff --git a/spacy/language.py b/spacy/language.py index afc988583..b228c2155 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,6 +18,7 @@ from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .analysis import count_pipeline_interdependencies from .gold import Example from .scorer import Scorer from .util import link_vectors_to_models, create_default_optimizer, registry @@ -545,7 +546,7 @@ class Language(object): if component_cfg is None: component_cfg = {} - component_deps = _count_pipeline_inter_dependencies(self.pipeline) + component_deps = count_pipeline_interdependencies(self.pipeline) # Determine whether component should set annotations. In theory I guess # we should do this by inspecting the meta? Or we could just always # say "yes" @@ -1160,25 +1161,6 @@ class DisabledPipes(list): self[:] = [] -def _count_pipeline_inter_dependencies(pipeline): - """Count how many subsequent components require an annotation set by each - component in the pipeline. - """ - pipe_assigns = [] - pipe_requires = [] - for name, pipe in pipeline: - pipe_assigns.append(set(getattr(pipe, "assigns", []))) - pipe_requires.append(set(getattr(pipe, "requires", []))) - counts = [] - for i, assigns in enumerate(pipe_assigns): - count = 0 - for requires in pipe_requires[i+1:]: - if assigns.intersection(requires): - count += 1 - counts.append(count) - return counts - - def _pipe(examples, proc, kwargs): # We added some args for pipe that __call__ doesn't expect. 
kwargs = dict(kwargs) diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index cda39f6ee..e608f2c34 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -2,6 +2,7 @@ import spacy.language from spacy.language import Language, component from spacy.analysis import print_summary, validate_attrs from spacy.analysis import get_assigns_for_attr, get_requires_for_attr +from spacy.analysis import count_pipeline_interdependencies from mock import Mock, ANY import pytest @@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe(): with pytest.warns(None) as record: nlp.remove_pipe("c2") assert not record.list + + +def test_pipe_interdependencies(): + class Fancifier: + name = "fancifier" + assigns = ("doc._.fancy",) + requires = tuple() + + class FancyNeeder: + name = "needer" + assigns = tuple() + requires = ("doc._.fancy",) + + pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())] + counts = count_pipeline_interdependencies(pipeline) + assert counts == [1, 0] diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 0397d490d..d42216655 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,5 +1,5 @@ import pytest -from spacy.language import Language, _count_pipeline_inter_dependencies +from spacy.language import Language @pytest.fixture @@ -198,19 +198,3 @@ def test_pipe_labels(nlp): assert len(nlp.pipe_labels) == len(input_labels) for name, labels in nlp.pipe_labels.items(): assert sorted(input_labels[name]) == sorted(labels) - - -def test_pipe_inter_dependencies(): - class Fancifier: - name = "fancifier" - assigns = ("doc._.fancy",) - requires = tuple() - - class FancyNeeder: - name = "needer" - assigns = tuple() - requires = ("doc._.fancy",) - - pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())] - counts = _count_pipeline_inter_dependencies(pipeline) - assert counts == [1, 0] From 12b7be1d9874048c1f3f20dffb833a88308544c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 16:49:26 +0200 Subject: [PATCH 11/39] Remove jsonschema from dependencies --- Makefile | 4 ++-- requirements.txt | 2 -- spacy/tests/package/test_requirements.py | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index cf96d6294..9916e3cf5 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data chmod a+rx $@ dist/pytest.pex : wheelhouse/pytest-*.whl @@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . 
-w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse + $(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex diff --git a/requirements.txt b/requirements.txt index c43ffa7bb..add083a05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,8 +14,6 @@ requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 importlib_metadata>=0.20; python_version < "3.8" -# Optional dependencies -jsonschema>=2.6.0,<3.1.0 pydantic>=1.3.0,<2.0.0 # Development dependencies cython>=0.25 diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 59a8569ee..0dc0f9d6c 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -9,7 +9,6 @@ def test_build_dependencies(): "pytest-timeout", "mock", "flake8", - "jsonschema", ] libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] From d844528c5f62f27904d6925f16cc7d1ee3e16949 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 16:55:15 +0200 Subject: [PATCH 12/39] Add test for is_compatible_model --- spacy/tests/test_misc.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index c320b19c0..0a0f4c7be 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -2,6 +2,7 @@ import pytest import os import ctypes from pathlib import Path +from spacy.about import __version__ as spacy_version from spacy import util from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding @@ -87,3 +88,11 @@ def test_ascii_filenames(): root = Path(__file__).parent.parent for path in root.glob("**/*"): assert all(ord(c) < 128 for c in path.name), path.name + + +@pytest.mark.parametrize( + "version,compatible", + [(spacy_version, True), ("2.0.0", False), (">=1.2.3,<4.5.6", False)], +) +def test_is_compatible_model(version, compatible): + assert util.is_compatible_model({"spacy_version": version}) is compatible From 569a65b60e4205311817d1a8add57fa16b407de7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 16:55:42 +0200 Subject: [PATCH 13/39] Auto-format --- spacy/tests/test_misc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 0a0f4c7be..ddf1bb332 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -5,7 +5,8 @@ from pathlib import Path from spacy.about import __version__ as spacy_version from spacy import util from spacy import prefer_gpu, require_gpu -from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding +from spacy.ml._precomputable_affine import PrecomputableAffine +from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding @pytest.fixture From 4465cad6c5bc188f628dc92183e2e855e26bcfc4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 17:42:06 +0200 Subject: [PATCH 14/39] Rename spacy.analysis to spacy.pipe_analysis --- spacy/language.py | 23 ++++++++++++++++++----- spacy/{analysis.py => pipe_analysis.py} | 4 +--- spacy/tests/pipeline/test_analysis.py | 8 ++++---- 3 files changed, 23 insertions(+), 12 deletions(-) rename spacy/{analysis.py => pipe_analysis.py} (99%) diff --git a/spacy/language.py b/spacy/language.py index 8c44cf26b..5286bd3b9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -17,8 +17,8 @@ from 
.tokens.underscore import Underscore from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups -from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs -from .analysis import count_pipeline_interdependencies +from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .pipe_analysis import count_pipeline_interdependencies from .gold import Example from .scorer import Scorer from .util import link_vectors_to_models, create_default_optimizer, registry @@ -318,14 +318,18 @@ class Language(object): # check whether we have a proper model config, or load a default one if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict): - warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name)) + warnings.warn( + Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name) + ) # refer to the model configuration in the cfg settings for this component if "model" in factory_cfg: self.config[name] = {"model": factory_cfg["model"]} # create all objects in the config - factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"] + factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)[ + "config" + ] model = factory_cfg.get("model", None) if model is not None: del factory_cfg["model"] @@ -519,7 +523,16 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None): + def update( + self, + examples, + dummy=None, + *, + drop=0.0, + sgd=None, + losses=None, + component_cfg=None, + ): """Update the models in the pipeline. examples (iterable): A batch of `Example` or `Doc` objects. diff --git a/spacy/analysis.py b/spacy/pipe_analysis.py similarity index 99% rename from spacy/analysis.py rename to spacy/pipe_analysis.py index 41591661c..4c0950453 100644 --- a/spacy/analysis.py +++ b/spacy/pipe_analysis.py @@ -187,10 +187,8 @@ def count_pipeline_interdependencies(pipeline): counts = [] for i, assigns in enumerate(pipe_assigns): count = 0 - for requires in pipe_requires[i+1:]: + for requires in pipe_requires[i + 1 :]: if assigns.intersection(requires): count += 1 counts.append(count) return counts - - diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index e608f2c34..b826438f5 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,8 +1,8 @@ import spacy.language from spacy.language import Language, component -from spacy.analysis import print_summary, validate_attrs -from spacy.analysis import get_assigns_for_attr, get_requires_for_attr -from spacy.analysis import count_pipeline_interdependencies +from spacy.pipe_analysis import print_summary, validate_attrs +from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr +from spacy.pipe_analysis import count_pipeline_interdependencies from mock import Mock, ANY import pytest @@ -169,7 +169,7 @@ def test_pipe_interdependencies(): name = "fancifier" assigns = ("doc._.fancy",) requires = tuple() - + class FancyNeeder: name = "needer" assigns = tuple() From 2d9de8684df7d28477986eb497e13b403c03d9d9 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Fri, 22 May 2020 23:10:40 +0200 Subject: [PATCH 15/39] Support use_pytorch_for_gpu_memory config --- spacy/cli/train_from_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train_from_config.py 
b/spacy/cli/train_from_config.py index eeb21c10c..c0e3bd169 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -7,7 +7,7 @@ from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import Model +from thinc.api import Model, use_pytorch_for_gpu_memory import random from ..gold import GoldCorpus @@ -171,6 +171,8 @@ def train_from_config( msg.info(f"Loading config from: {config_path}") config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) + if config["training"]["use_pytorch_for_gpu_memory"]: + use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) msg.info("Creating nlp from config") From f9786d765edf16afa092cf378a0a45fb321efe22 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 14:48:56 +0200 Subject: [PATCH 16/39] Simplify is_package check --- spacy/cli/download.py | 18 ++---------------- spacy/util.py | 13 +++++-------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0230e272d..af132bbbe 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -5,6 +5,7 @@ import sys from wasabi import msg from .. import about +from ..util import is_package def download( @@ -17,7 +18,7 @@ def download( flag is set, the command expects the full model name with version. For direct downloads, the compatibility check will be skipped. """ - if not require_package("spacy") and "--no-deps" not in pip_args: + if not is_package("spacy") and "--no-deps" not in pip_args: msg.warn( "Skipping model package dependencies and setting `--no-deps`. " "You don't seem to have the spaCy package itself installed " @@ -45,21 +46,6 @@ def download( "Download and installation successful", f"You can now load the model via spacy.load('{model_name}')", ) - # If a model is downloaded and then loaded within the same process, our - # is_package check currently fails, because pkg_resources.working_set - # is not refreshed automatically (see #3923). We're trying to work - # around this here be requiring the package explicitly. - require_package(model_name) - - -def require_package(name): - try: - import pkg_resources - - pkg_resources.working_set.require(name) - return True - except: # noqa: E722 - return False def get_json(url, desc): diff --git a/spacy/util.py b/spacy/util.py index 5a7c633fa..41af881c9 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -341,14 +341,11 @@ def is_package(name): name (unicode): Name of package. RETURNS (bool): True if installed package, False if not. 
""" - import pkg_resources - - name = name.lower() # compare package name against lowercase name - packages = pkg_resources.working_set.by_key.keys() - for package in packages: - if package.lower().replace("-", "_") == name: - return True - return False + try: + importlib_metadata.distribution(name) + return True + except: # noqa: E722 + return False def get_package_path(name): From 387c7aba15228557cdbbfae0ee3ab90009769584 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 14:55:16 +0200 Subject: [PATCH 17/39] Update test --- spacy/tests/test_misc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index ddf1bb332..9e67ae83b 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -26,10 +26,12 @@ def test_util_ensure_path_succeeds(text): assert isinstance(path, Path) -@pytest.mark.parametrize("package", ["numpy"]) -def test_util_is_package(package): +@pytest.mark.parametrize( + "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)] +) +def test_util_is_package(package, result): """Test that an installed package via pip is recognised by util.is_package.""" - assert util.is_package(package) + assert util.is_package(package) is result @pytest.mark.parametrize("package", ["thinc"]) From 5d3806e059178c9516fb6cf57064cb10cfbf0f29 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 17:20:58 +0200 Subject: [PATCH 18/39] unicode -> str consistency --- spacy/cli/converters/conllu2json.py | 8 ++-- spacy/displacy/__init__.py | 8 ++-- spacy/displacy/render.py | 26 +++++------ spacy/errors.py | 2 +- spacy/glossary.py | 4 +- spacy/kb.pyx | 4 +- spacy/language.py | 26 +++++------ spacy/lemmatizer.py | 10 ++--- spacy/lexeme.pyx | 20 ++++----- spacy/lookups.py | 28 ++++++------ spacy/matcher/dependencymatcher.pyx | 2 +- spacy/matcher/matcher.pyx | 6 +-- spacy/matcher/phrasematcher.pyx | 6 +-- spacy/morphology.pyx | 4 +- spacy/pipe_analysis.py | 6 +-- spacy/pipeline/entityruler.py | 4 +- spacy/pipeline/functions.py | 2 +- spacy/strings.pyx | 6 +-- spacy/tokenizer.pyx | 16 +++---- spacy/tokens/doc.pyx | 14 +++--- spacy/tokens/span.pyx | 22 +++++----- spacy/tokens/token.pyx | 44 +++++++++---------- spacy/util.py | 36 +++++++-------- spacy/vectors.pyx | 8 ++-- spacy/vocab.pyx | 6 +-- website/docs/api/lexeme.md | 16 +++---- website/docs/api/vocab.md | 30 ++++++------- website/docs/usage/rule-based-matching.md | 53 +++++++++++++---------- 28 files changed, 212 insertions(+), 205 deletions(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 0b2920802..1ece755b8 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None): final entity type with `ner_map` if mapping present. Entity tag is 'O' if the pattern is not matched. - lines (unicode): CONLL-U lines for one sentences - tag_pattern (unicode): Regex pattern for entity tag + lines (str): CONLL-U lines for one sentences + tag_pattern (str): Regex pattern for entity tag ner_map (dict): Map old NER tag names to new ones, '' maps to O. RETURNS (list): List of BILUO entity tags """ @@ -187,8 +187,8 @@ def example_from_conllu_sentence( """Create an Example from the lines for one CoNLL-U sentence, merging subtokens and appending morphology to tags if required. 
- lines (unicode): The non-comment lines for a CoNLL-U sentence - ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col + lines (str): The non-comment lines for a CoNLL-U sentence + ner_tag_pattern (str): The regex pattern for matching NER in MISC col RETURNS (Example): An example containing the annotation """ # create a Doc with each subtoken as its own token diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 3f84dabce..2c377a043 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -22,13 +22,13 @@ def render( """Render displaCy visualisation. docs (list or Doc): Document(s) to visualise. - style (unicode): Visualisation style, 'dep' or 'ent'. + style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. jupyter (bool): Override Jupyter auto-detection. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. - RETURNS (unicode): Rendered HTML markup. + RETURNS (str): Rendered HTML markup. DOCS: https://spacy.io/api/top-level#displacy.render USAGE: https://spacy.io/usage/visualizers @@ -73,13 +73,13 @@ def serve( """Serve displaCy visualisation. docs (list or Doc): Document(s) to visualise. - style (unicode): Visualisation style, 'dep' or 'ent'. + style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. port (int): Port to serve visualisation. - host (unicode): Host to serve visualisation. + host (str): Host to serve visualisation. DOCS: https://spacy.io/api/top-level#displacy.serve USAGE: https://spacy.io/usage/visualizers diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 0d4cdb77f..d3572ce78 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -47,7 +47,7 @@ class DependencyRenderer(object): parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (unicode): Rendered SVG or HTML markup. + RETURNS (str): Rendered SVG or HTML markup. """ # Create a random ID prefix to make sure parses don't receive the # same ID, even if they're identical @@ -78,7 +78,7 @@ class DependencyRenderer(object): render_id (int): Unique ID, typically index of document. words (list): Individual words and their tags. arcs (list): Individual arcs and their start, end, direction and label. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ self.levels = self.get_levels(arcs) self.highest_level = len(self.levels) @@ -112,10 +112,10 @@ class DependencyRenderer(object): ): """Render individual word. - text (unicode): Word text. - tag (unicode): Part-of-speech tag. + text (str): Word text. + tag (str): Part-of-speech tag. i (int): Unique ID, typically word index. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ y = self.offset_y + self.word_spacing x = self.offset_x + i * self.distance @@ -131,12 +131,12 @@ class DependencyRenderer(object): def render_arrow(self, label, start, end, direction, i): """Render individual arrow. - label (unicode): Dependency label. + label (str): Dependency label. start (int): Index of start word. end (int): Index of end word. - direction (unicode): Arrow direction, 'left' or 'right'. 
+ direction (str): Arrow direction, 'left' or 'right'. i (int): Unique ID, typically arrow index. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ if start < 0 or end < 0: error_args = dict(start=start, end=end, label=label, dir=direction) @@ -179,7 +179,7 @@ class DependencyRenderer(object): y (int): Y-coordinate of arrow start and end point. y_curve (int): Y-corrdinate of Cubic Bézier y_curve point. x_end (int): X-coordinate of arrow end point. - RETURNS (unicode): Definition of the arc path ('d' attribute). + RETURNS (str): Definition of the arc path ('d' attribute). """ template = "M{x},{y} C{x},{c} {e},{c} {e},{y}" if self.compact: @@ -189,11 +189,11 @@ class DependencyRenderer(object): def get_arrowhead(self, direction, x, y, end): """Render individual arrow head. - direction (unicode): Arrow direction, 'left' or 'right'. + direction (str): Arrow direction, 'left' or 'right'. x (int): X-coordinate of arrow start point. y (int): Y-coordinate of arrow start and end point. end (int): X-coordinate of arrow end point. - RETURNS (unicode): Definition of the arrow head path ('d' attribute). + RETURNS (str): Definition of the arrow head path ('d' attribute). """ if direction == "left": pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2) @@ -279,7 +279,7 @@ class EntityRenderer(object): parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (unicode): Rendered HTML markup. + RETURNS (str): Rendered HTML markup. """ rendered = [] for i, p in enumerate(parsed): @@ -300,7 +300,7 @@ class EntityRenderer(object): def render_ents(self, text, spans, title): """Render entities in text. - text (unicode): Original text. + text (str): Original text. spans (list): Individual entity spans and their start, end and label. title (unicode or None): Document title set in Doc.user_data['title']. """ diff --git a/spacy/errors.py b/spacy/errors.py index 4d38ab586..932bb1eff 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -598,7 +598,7 @@ class MatchPatternError(ValueError): def __init__(self, key, errors): """Custom error for validating match patterns. - key (unicode): The name of the matcher rule. + key (str): The name of the matcher rule. errors (dict): Validation errors (sequence of strings) mapped to pattern ID, i.e. the index of the added pattern. """ diff --git a/spacy/glossary.py b/spacy/glossary.py index 938a575cd..c4a6a5c45 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -1,8 +1,8 @@ def explain(term): """Get a description for a given POS tag, dependency label or entity type. - term (unicode): The term to explain. - RETURNS (unicode): The explanation, or `None` if not found in the glossary. + term (str): The term to explain. + RETURNS (str): The explanation, or `None` if not found in the glossary. 
EXAMPLE: >>> spacy.explain(u'NORP') diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 86a8d49b8..8d8464f3c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -38,7 +38,7 @@ cdef class Candidate: @property def entity_(self): - """RETURNS (unicode): ID/name of this entity in the KB""" + """RETURNS (str): ID/name of this entity in the KB""" return self.kb.vocab.strings[self.entity_hash] @property @@ -48,7 +48,7 @@ cdef class Candidate: @property def alias_(self): - """RETURNS (unicode): ID of the original alias""" + """RETURNS (str): ID of the original alias""" return self.kb.vocab.strings[self.alias_hash] @property diff --git a/spacy/language.py b/spacy/language.py index 5286bd3b9..e3b770723 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -122,7 +122,7 @@ class Language(object): Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. - lang (unicode): Two-letter language ID, i.e. ISO code. + lang (str): Two-letter language ID, i.e. ISO code. DOCS: https://spacy.io/api/language """ @@ -287,7 +287,7 @@ class Language(object): def get_pipe(self, name): """Get a pipeline component for a given component name. - name (unicode): Name of pipeline component to get. + name (str): Name of pipeline component to get. RETURNS (callable): The pipeline component. DOCS: https://spacy.io/api/language#get_pipe @@ -300,7 +300,7 @@ class Language(object): def create_pipe(self, name, config=dict()): """Create a pipeline component from a factory. - name (unicode): Factory name to look up in `Language.factories`. + name (str): Factory name to look up in `Language.factories`. config (dict): Configuration parameters to initialise component. RETURNS (callable): Pipeline component. @@ -343,12 +343,12 @@ class Language(object): of before/after/first/last can be set. Default behaviour is "last". component (callable): The pipeline component. - name (unicode): Name of pipeline component. Overwrites existing + name (str): Name of pipeline component. Overwrites existing component.name attribute if available. If no name is set and the component exposes no name attribute, component.__name__ is used. An error is raised if a name already exists in the pipeline. - before (unicode): Component name to insert component directly before. - after (unicode): Component name to insert component directly after. + before (str): Component name to insert component directly before. + after (str): Component name to insert component directly after. first (bool): Insert component first / not first in the pipeline. last (bool): Insert component last / not last in the pipeline. @@ -389,7 +389,7 @@ class Language(object): """Check if a component name is present in the pipeline. Equivalent to `name in nlp.pipe_names`. - name (unicode): Name of the component. + name (str): Name of the component. RETURNS (bool): Whether a component of the name exists in the pipeline. DOCS: https://spacy.io/api/language#has_pipe @@ -399,7 +399,7 @@ class Language(object): def replace_pipe(self, name, component): """Replace a component in the pipeline. - name (unicode): Name of the component to replace. + name (str): Name of the component to replace. component (callable): Pipeline component. DOCS: https://spacy.io/api/language#replace_pipe @@ -418,8 +418,8 @@ class Language(object): def rename_pipe(self, old_name, new_name): """Rename a pipeline component. - old_name (unicode): Name of the component to rename. - new_name (unicode): New name of the component. + old_name (str): Name of the component to rename. 
+ new_name (str): New name of the component. DOCS: https://spacy.io/api/language#rename_pipe """ @@ -433,7 +433,7 @@ class Language(object): def remove_pipe(self, name): """Remove a component from the pipeline. - name (unicode): Name of the component to remove. + name (str): Name of the component to remove. RETURNS (tuple): A `(name, component)` tuple of the removed component. DOCS: https://spacy.io/api/language#remove_pipe @@ -450,7 +450,7 @@ class Language(object): and can contain arbitrary whitespace. Alignment into the original string is preserved. - text (unicode): The text to be processed. + text (str): The text to be processed. disable (list): Names of the pipeline components to disable. component_cfg (dict): An optional dictionary with extra keyword arguments for specific components. @@ -1086,7 +1086,7 @@ class component(object): ): """Decorate a pipeline component. - name (unicode): Default component and factory name. + name (str): Default component and factory name. assigns (list): Attributes assigned by component, e.g. `["token.pos"]`. requires (list): Attributes required by component, e.g. `["token.dep"]`. retokenizes (bool): Whether the component changes the tokenization. diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 3ba86c169..aeedbde84 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -29,8 +29,8 @@ class Lemmatizer(object): def __call__(self, string, univ_pos, morphology=None): """Lemmatize a string. - string (unicode): The string to lemmatize, e.g. the token text. - univ_pos (unicode / int): The token's universal part-of-speech tag. + string (str): The string to lemmatize, e.g. the token text. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. RETURNS (list): The available lemmas for the string. @@ -69,7 +69,7 @@ class Lemmatizer(object): Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. - univ_pos (unicode / int): The token's universal part-of-speech tag. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. """ @@ -128,10 +128,10 @@ class Lemmatizer(object): """Look up a lemma in the table, if available. If no lemma is found, the original string is returned. - string (unicode): The original string. + string (str): The original string. orth (int): Optional hash of the string to look up. If not set, the string will be used and hashed. - RETURNS (unicode): The lemma if the string was found, otherwise the + RETURNS (str): The lemma if the string was found, otherwise the original string. """ lookup_table = self.lookups.get_table("lemma_lookup", {}) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 20e175f03..911112d50 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -190,7 +190,7 @@ cdef class Lexeme: self.vocab.set_vector(self.c.orth, vector) property rank: - """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used + """RETURNS (str): Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors.""" def __get__(self): return self.c.id @@ -209,18 +209,18 @@ cdef class Lexeme: @property def orth_(self): - """RETURNS (unicode): The original verbatim text of the lexeme + """RETURNS (str): The original verbatim text of the lexeme (identical to `Lexeme.text`). 
Exists mostly for consistency with the other attributes.""" return self.vocab.strings[self.c.orth] @property def text(self): - """RETURNS (unicode): The original verbatim text of the lexeme.""" + """RETURNS (str): The original verbatim text of the lexeme.""" return self.orth_ property lower: - """RETURNS (unicode): Lowercase form of the lexeme.""" + """RETURNS (str): Lowercase form of the lexeme.""" def __get__(self): return self.c.lower @@ -293,7 +293,7 @@ cdef class Lexeme: self.c.prob = x property lower_: - """RETURNS (unicode): Lowercase form of the word.""" + """RETURNS (str): Lowercase form of the word.""" def __get__(self): return self.vocab.strings[self.c.lower] @@ -301,7 +301,7 @@ cdef class Lexeme: self.c.lower = self.vocab.strings.add(x) property norm_: - """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the + """RETURNS (str): The lexemes's norm, i.e. a normalised form of the lexeme text. """ def __get__(self): @@ -311,7 +311,7 @@ cdef class Lexeme: self.c.norm = self.vocab.strings.add(x) property shape_: - """RETURNS (unicode): Transform of the word's string, to show + """RETURNS (str): Transform of the word's string, to show orthographic features. """ def __get__(self): @@ -321,7 +321,7 @@ cdef class Lexeme: self.c.shape = self.vocab.strings.add(x) property prefix_: - """RETURNS (unicode): Length-N substring from the start of the word. + """RETURNS (str): Length-N substring from the start of the word. Defaults to `N=1`. """ def __get__(self): @@ -331,7 +331,7 @@ cdef class Lexeme: self.c.prefix = self.vocab.strings.add(x) property suffix_: - """RETURNS (unicode): Length-N substring from the end of the word. + """RETURNS (str): Length-N substring from the end of the word. Defaults to `N=3`. """ def __get__(self): @@ -341,7 +341,7 @@ cdef class Lexeme: self.c.suffix = self.vocab.strings.add(x) property lang_: - """RETURNS (unicode): Language of the parent vocabulary.""" + """RETURNS (str): Language of the parent vocabulary.""" def __get__(self): return self.vocab.strings[self.c.lang] diff --git a/spacy/lookups.py b/spacy/lookups.py index a9d371b79..5661897e1 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -31,7 +31,7 @@ class Lookups(object): """Check if the lookups contain a table of a given name. Delegates to Lookups.has_table. - name (unicode): Name of the table. + name (str): Name of the table. RETURNS (bool): Whether a table of that name is in the lookups. """ return self.has_table(name) @@ -48,7 +48,7 @@ class Lookups(object): def add_table(self, name, data=SimpleFrozenDict()): """Add a new table to the lookups. Raises an error if the table exists. - name (unicode): Unique name of table. + name (str): Unique name of table. data (dict): Optional data to add to the table. RETURNS (Table): The newly added table. @@ -64,7 +64,7 @@ class Lookups(object): """Get a table. Raises an error if the table doesn't exist and no default value is provided. - name (unicode): Name of the table. + name (str): Name of the table. default: Optional default value to return if table doesn't exist. RETURNS (Table): The table. @@ -79,7 +79,7 @@ class Lookups(object): def remove_table(self, name): """Remove a table. Raises an error if the table doesn't exist. - name (unicode): Name of the table to remove. + name (str): Name of the table to remove. RETURNS (Table): The removed table. DOCS: https://spacy.io/api/lookups#remove_table @@ -91,7 +91,7 @@ class Lookups(object): def has_table(self, name): """Check if the lookups contain a table of a given name. 
- name (unicode): Name of the table. + name (str): Name of the table. RETURNS (bool): Whether a table of that name exists. DOCS: https://spacy.io/api/lookups#has_table @@ -125,7 +125,7 @@ class Lookups(object): """Save the lookups to a directory as lookups.bin. Expects a path to a directory, which will be created if it doesn't exist. - path (unicode / Path): The file path. + path (str / Path): The file path. DOCS: https://spacy.io/api/lookups#to_disk """ @@ -141,7 +141,7 @@ class Lookups(object): """Load lookups from a directory containing a lookups.bin. Will skip loading if the file doesn't exist. - path (unicode / Path): The directory path. + path (str / Path): The directory path. RETURNS (Lookups): The loaded lookups. DOCS: https://spacy.io/api/lookups#from_disk @@ -167,7 +167,7 @@ class Table(OrderedDict): """Initialize a new table from a dict. data (dict): The dictionary. - name (unicode): Optional table name for reference. + name (str): Optional table name for reference. RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.from_dict @@ -179,7 +179,7 @@ class Table(OrderedDict): def __init__(self, name=None, data=None): """Initialize a new table. - name (unicode): Optional table name for reference. + name (str): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. RETURNS (Table): The newly created object. @@ -197,7 +197,7 @@ class Table(OrderedDict): def __setitem__(self, key, value): """Set new key/value pair. String keys will be hashed. - key (unicode / int): The key to set. + key (str / int): The key to set. value: The value to set. """ key = get_string_id(key) @@ -208,7 +208,7 @@ class Table(OrderedDict): """Set new key/value pair. String keys will be hashed. Same as table[key] = value. - key (unicode / int): The key to set. + key (str / int): The key to set. value: The value to set. """ self[key] = value @@ -216,7 +216,7 @@ class Table(OrderedDict): def __getitem__(self, key): """Get the value for a given key. String keys will be hashed. - key (unicode / int): The key to get. + key (str / int): The key to get. RETURNS: The value. """ key = get_string_id(key) @@ -225,7 +225,7 @@ class Table(OrderedDict): def get(self, key, default=None): """Get the value for a given key. String keys will be hashed. - key (unicode / int): The key to get. + key (str / int): The key to get. default: The default value to return. RETURNS: The value. """ @@ -235,7 +235,7 @@ class Table(OrderedDict): def __contains__(self, key): """Check whether a key is in the table. String keys will be hashed. - key (unicode / int): The key to check. + key (str / int): The key to check. RETURNS (bool): Whether the key is in the table. """ key = get_string_id(key) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index ff707a71c..732931380 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -66,7 +66,7 @@ cdef class DependencyMatcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. 
""" return self._normalize_key(key) in self._patterns diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 2bcb82a2a..225eba9a9 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -63,7 +63,7 @@ cdef class Matcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ return self._normalize_key(key) in self._patterns @@ -97,7 +97,7 @@ cdef class Matcher: number of arguments). The on_match callback becomes an optional keyword argument. - key (unicode): The match ID. + key (str): The match ID. patterns (list): The patterns to add for the given key. on_match (callable): Optional callback executed on match. *_patterns (list): For backwards compatibility: list of patterns to add @@ -138,7 +138,7 @@ cdef class Matcher: """Remove a rule from the matcher. A KeyError is raised if the key does not exist. - key (unicode): The ID of the match rule. + key (str): The ID of the match rule. """ norm_key = self._normalize_key(key) if not norm_key in self._patterns: diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 14cc39787..f7ce44ece 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -70,7 +70,7 @@ cdef class PhraseMatcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. DOCS: https://spacy.io/api/phrasematcher#contains @@ -85,7 +85,7 @@ cdef class PhraseMatcher: """Remove a rule from the matcher by match ID. A KeyError is raised if the key does not exist. - key (unicode): The match ID. + key (str): The match ID. DOCS: https://spacy.io/api/phrasematcher#remove """ @@ -159,7 +159,7 @@ cdef class PhraseMatcher: number of arguments). The on_match callback becomes an optional keyword argument. - key (unicode): The match ID. + key (str): The match ID. docs (list): List of `Doc` objects representing match patterns. on_match (callable): Callback executed on match. *_docs (Doc): For backwards compatibility: list of patterns to add diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 0b53b124c..5dcf81ea7 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -198,8 +198,8 @@ cdef class Morphology: """Add a special-case rule to the morphological analyser. Tokens whose tag and orth match the rule will receive the specified properties. - tag (unicode): The part-of-speech tag to key the exception. - orth (unicode): The word-form to key the exception. + tag (str): The part-of-speech tag to key the exception. + orth (str): The word-form to key the exception. """ attrs = dict(attrs) attrs = _normalize_props(attrs) diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index 4c0950453..971ebe518 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): fulfilled (e.g. if previous components assign the attributes). pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - name (unicode): The name of the pipeline component to analyze. + name (str): The name of the pipeline component to analyze. pipe (callable): The pipeline component function to analyze. index (int): The index of the component in the pipeline. warn (bool): Show user warning if problem is found. 
@@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr): """Get all pipeline components that assign an attr, e.g. "doc.tensor". pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - attr (unicode): The attribute to check. + attr (str): The attribute to check. RETURNS (list): (name, pipeline) tuples of components that assign the attr. """ return _get_feature_for_attr(pipeline, attr, "assigns") @@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr): """Get all pipeline components that require an attr, e.g. "doc.tensor". pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - attr (unicode): The attribute to check. + attr (str): The attribute to check. RETURNS (list): (name, pipeline) tuples of components that require the attr. """ return _get_feature_for_attr(pipeline, attr, "requires") diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 58160c2e9..cdacc82f6 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -315,7 +315,7 @@ class EntityRuler(object): """Load the entity ruler from a file. Expects a file containing newline-delimited JSON (JSONL) with one entry per line. - path (unicode / Path): The JSONL file to load. + path (str / Path): The JSONL file to load. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. @@ -351,7 +351,7 @@ class EntityRuler(object): """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). - path (unicode / Path): The JSONL file to save. + path (str / Path): The JSONL file to save. **kwargs: Other config paramters, mostly for consistency. DOCS: https://spacy.io/api/entityruler#to_disk diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 6e9d4197c..622791512 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"): """Merge subtokens into a single token. doc (Doc): The Doc object. - label (unicode): The subtoken dependency label. + label (str): The subtoken dependency label. RETURNS (Doc): The Doc object with merged subtokens. DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens diff --git a/spacy/strings.pyx b/spacy/strings.pyx index a30f11729..9fe5af154 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -152,7 +152,7 @@ cdef class StringStore: def add(self, string): """Add a string to the StringStore. - string (unicode): The string to add. + string (str): The string to add. RETURNS (uint64): The string's hash value. """ if isinstance(string, unicode): @@ -179,7 +179,7 @@ cdef class StringStore: def __contains__(self, string not None): """Check whether a string is in the store. - string (unicode): The string to check. + string (str): The string to check. RETURNS (bool): Whether the store contains the string. """ cdef hash_t key @@ -205,7 +205,7 @@ cdef class StringStore: def __iter__(self): """Iterate over the strings in the store, in order. - YIELDS (unicode): A string in the store. + YIELDS (str): A string in the store. """ cdef int i cdef hash_t key diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 7e75052f7..b628b1171 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -134,7 +134,7 @@ cdef class Tokenizer: def __call__(self, unicode string): """Tokenize a string. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. 
DOCS: https://spacy.io/api/tokenizer#call @@ -147,7 +147,7 @@ cdef class Tokenizer: cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): """Tokenize according to affix and token_match settings. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. """ if len(string) >= (2 ** 30): @@ -527,7 +527,7 @@ cdef class Tokenizer: def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. - string (unicode): The string to segment. + string (str): The string to segment. RETURNS (list): A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. @@ -542,7 +542,7 @@ cdef class Tokenizer: """Find the length of a prefix that should be segmented from the string, or None if no prefix rules match. - string (unicode): The string to segment. + string (str): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`. DOCS: https://spacy.io/api/tokenizer#find_prefix @@ -556,7 +556,7 @@ cdef class Tokenizer: """Find the length of a suffix that should be segmented from the string, or None if no suffix rules match. - string (unicode): The string to segment. + string (str): The string to segment. Returns (int): The length of the suffix if present, otherwise `None`. DOCS: https://spacy.io/api/tokenizer#find_suffix @@ -576,7 +576,7 @@ cdef class Tokenizer: def _validate_special_case(self, chunk, substrings): """Check whether the `ORTH` fields match the string. - string (unicode): The string to specially tokenize. + string (str): The string to specially tokenize. substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. """ @@ -588,7 +588,7 @@ cdef class Tokenizer: def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. - string (unicode): The string to specially tokenize. + string (str): The string to specially tokenize. substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. @@ -629,7 +629,7 @@ cdef class Tokenizer: produced are identical to `nlp.tokenizer()` except for whitespace tokens. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (list): A list of (pattern_string, token_string) tuples DOCS: https://spacy.io/api/tokenizer#explain diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0716b2b3d..f6d0dbf4a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -107,7 +107,7 @@ cdef class Doc: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Doc._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -125,7 +125,7 @@ cdef class Doc: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/doc#get_extension @@ -136,7 +136,7 @@ cdef class Doc: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. 
+ name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/doc#has_extension @@ -147,7 +147,7 @@ cdef class Doc: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -473,7 +473,7 @@ cdef class Doc: def text(self): """A unicode representation of the document text. - RETURNS (unicode): The original verbatim text of the document. + RETURNS (str): The original verbatim text of the document. """ return "".join(t.text_with_ws for t in self) @@ -482,7 +482,7 @@ cdef class Doc: """An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. - RETURNS (unicode): The original verbatim text of the document. + RETURNS (str): The original verbatim text of the document. """ return self.text @@ -628,7 +628,7 @@ cdef class Doc: @property def lang_(self): - """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'.""" + """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'.""" return self.vocab.lang cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 66e8d8c3e..59323c393 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -33,7 +33,7 @@ cdef class Span: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Span._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -51,7 +51,7 @@ cdef class Span: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/span#get_extension @@ -62,7 +62,7 @@ cdef class Span: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/span#has_extension @@ -73,7 +73,7 @@ cdef class Span: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -501,7 +501,7 @@ cdef class Span: @property def text(self): - """RETURNS (unicode): The original verbatim text of the span.""" + """RETURNS (str): The original verbatim text of the span.""" text = self.text_with_ws if self[-1].whitespace_: text = text[:-1] @@ -512,7 +512,7 @@ cdef class Span: """The text content of the span with a trailing whitespace character if the last token has one. - RETURNS (unicode): The text content of the span (with trailing + RETURNS (str): The text content of the span (with trailing whitespace). 
""" return "".join([t.text_with_ws for t in self]) @@ -688,7 +688,7 @@ cdef class Span: raise NotImplementedError(TempErrors.T007.format(attr="ent_id")) property ent_id_: - """RETURNS (unicode): The (string) entity ID.""" + """RETURNS (str): The (string) entity ID.""" def __get__(self): return self.root.ent_id_ @@ -700,12 +700,12 @@ cdef class Span: """Verbatim text content (identical to `Span.text`). Exists mostly for consistency with other attributes. - RETURNS (unicode): The span's text.""" + RETURNS (str): The span's text.""" return self.text @property def lemma_(self): - """RETURNS (unicode): The span's lemma.""" + """RETURNS (str): The span's lemma.""" return " ".join([t.lemma_ for t in self]).strip() @property @@ -724,7 +724,7 @@ cdef class Span: return "".join([t.text_with_ws for t in self]) property label_: - """RETURNS (unicode): The span's label.""" + """RETURNS (str): The span's label.""" def __get__(self): return self.doc.vocab.strings[self.label] @@ -734,7 +734,7 @@ cdef class Span: raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) property kb_id_: - """RETURNS (unicode): The named entity's KB ID.""" + """RETURNS (str): The named entity's KB ID.""" def __get__(self): return self.doc.vocab.strings[self.kb_id] diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2486ed991..0d1e82322 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -36,7 +36,7 @@ cdef class Token: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Token._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -54,7 +54,7 @@ cdef class Token: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/token#get_extension @@ -65,7 +65,7 @@ cdef class Token: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/token#has_extension @@ -76,7 +76,7 @@ cdef class Token: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -244,12 +244,12 @@ cdef class Token: @property def text(self): - """RETURNS (unicode): The original verbatim text of the token.""" + """RETURNS (str): The original verbatim text of the token.""" return self.orth_ @property def text_with_ws(self): - """RETURNS (unicode): The text content of the span (with trailing + """RETURNS (str): The text content of the span (with trailing whitespace). """ cdef unicode orth = self.vocab.strings[self.c.lex.orth] @@ -740,7 +740,7 @@ cdef class Token: self.c.ent_type = ent_type property ent_type_: - """RETURNS (unicode): Named entity type.""" + """RETURNS (str): Named entity type.""" def __get__(self): return self.vocab.strings[self.c.ent_type] @@ -763,7 +763,7 @@ cdef class Token: and "" means no entity tag is set. "B" with an empty ent_type means that the token is blocked from further processing by NER. 
- RETURNS (unicode): IOB code of named entity tag. + RETURNS (str): IOB code of named entity tag. """ iob_strings = ("", "I", "O", "B") return iob_strings[self.c.ent_iob] @@ -779,7 +779,7 @@ cdef class Token: self.c.ent_id = key property ent_id_: - """RETURNS (unicode): ID of the entity the token is an instance of, + """RETURNS (str): ID of the entity the token is an instance of, if any. """ def __get__(self): @@ -797,7 +797,7 @@ cdef class Token: self.c.ent_kb_id = ent_kb_id property ent_kb_id_: - """RETURNS (unicode): Named entity KB ID.""" + """RETURNS (str): Named entity KB ID.""" def __get__(self): return self.vocab.strings[self.c.ent_kb_id] @@ -806,12 +806,12 @@ cdef class Token: @property def whitespace_(self): - """RETURNS (unicode): The trailing whitespace character, if present.""" + """RETURNS (str): The trailing whitespace character, if present.""" return " " if self.c.spacy else "" @property def orth_(self): - """RETURNS (unicode): Verbatim text content (identical to + """RETURNS (str): Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. """ @@ -819,13 +819,13 @@ cdef class Token: @property def lower_(self): - """RETURNS (unicode): The lowercase token text. Equivalent to + """RETURNS (str): The lowercase token text. Equivalent to `Token.text.lower()`. """ return self.vocab.strings[self.c.lex.lower] property norm_: - """RETURNS (unicode): The token's norm, i.e. a normalised form of the + """RETURNS (str): The token's norm, i.e. a normalised form of the token text. Usually set in the language's tokenizer exceptions or norm exceptions. """ @@ -837,34 +837,34 @@ cdef class Token: @property def shape_(self): - """RETURNS (unicode): Transform of the tokens's string, to show + """RETURNS (str): Transform of the tokens's string, to show orthographic features. For example, "Xxxx" or "dd". """ return self.vocab.strings[self.c.lex.shape] @property def prefix_(self): - """RETURNS (unicode): A length-N substring from the start of the token. + """RETURNS (str): A length-N substring from the start of the token. Defaults to `N=1`. """ return self.vocab.strings[self.c.lex.prefix] @property def suffix_(self): - """RETURNS (unicode): A length-N substring from the end of the token. + """RETURNS (str): A length-N substring from the end of the token. Defaults to `N=3`. """ return self.vocab.strings[self.c.lex.suffix] @property def lang_(self): - """RETURNS (unicode): Language of the parent document's vocabulary, + """RETURNS (str): Language of the parent document's vocabulary, e.g. 'en'. """ return self.vocab.strings[self.c.lex.lang] property lemma_: - """RETURNS (unicode): The token lemma, i.e. the base form of the word, + """RETURNS (str): The token lemma, i.e. the base form of the word, with no inflectional suffixes. 
""" def __get__(self): @@ -877,7 +877,7 @@ cdef class Token: self.c.lemma = self.vocab.strings.add(lemma_) property pos_: - """RETURNS (unicode): Coarse-grained part-of-speech tag.""" + """RETURNS (str): Coarse-grained part-of-speech tag.""" def __get__(self): return parts_of_speech.NAMES[self.c.pos] @@ -885,7 +885,7 @@ cdef class Token: self.c.pos = parts_of_speech.IDS[pos_name] property tag_: - """RETURNS (unicode): Fine-grained part-of-speech tag.""" + """RETURNS (str): Fine-grained part-of-speech tag.""" def __get__(self): return self.vocab.strings[self.c.tag] @@ -893,7 +893,7 @@ cdef class Token: self.tag = self.vocab.strings.add(tag) property dep_: - """RETURNS (unicode): The syntactic dependency label.""" + """RETURNS (str): The syntactic dependency label.""" def __get__(self): return self.vocab.strings[self.c.dep] diff --git a/spacy/util.py b/spacy/util.py index 41af881c9..fc5837755 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -58,7 +58,7 @@ def lang_class_is_loaded(lang): loaded lazily, to avoid expensive setup code associated with the language data. - lang (unicode): Two-letter language code, e.g. 'en'. + lang (str): Two-letter language code, e.g. 'en'. RETURNS (bool): Whether a Language class has been loaded. """ return lang in registry.languages @@ -67,7 +67,7 @@ def lang_class_is_loaded(lang): def get_lang_class(lang): """Import and load a Language class. - lang (unicode): Two-letter language code, e.g. 'en'. + lang (str): Two-letter language code, e.g. 'en'. RETURNS (Language): Language class. """ # Check if language is registered / entry point is available @@ -85,7 +85,7 @@ def get_lang_class(lang): def set_lang_class(name, cls): """Set a custom Language class name that can be loaded via get_lang_class. - name (unicode): Name of Language class. + name (str): Name of Language class. cls (Language): Language class. """ registry.languages.register(name, func=cls) @@ -107,7 +107,7 @@ def load_language_data(path): """Load JSON language data using the given path as a base. If the provided path isn't present, will attempt to load a gzipped version before giving up. - path (unicode / Path): The data to load. + path (str / Path): The data to load. RETURNS: The loaded data. """ path = ensure_path(path) @@ -128,7 +128,7 @@ def get_module_path(module): def load_model(name, **overrides): """Load a model from a package or data path. - name (unicode): Package name or model path. + name (str): Package name or model path. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with the loaded model. """ @@ -202,7 +202,7 @@ def load_model_from_init_py(init_file, **overrides): """Helper function to use in the `load()` method of a model package's __init__.py. - init_file (unicode): Path to model's __init__.py, i.e. `__file__`. + init_file (str): Path to model's __init__.py, i.e. `__file__`. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with loaded model. """ @@ -227,8 +227,8 @@ def get_package_version(name): """Get the version of an installed package. Typically used to get model package versions. - name (unicode): The name of the installed Python package. - RETURNS (unicode / None): The version or None if package not installed. + name (str): The name of the installed Python package. + RETURNS (str / None): The version or None if package not installed. 
""" try: return importlib_metadata.version(name) @@ -338,7 +338,7 @@ def get_model_config(path): def is_package(name): """Check if string maps to a package installed via pip. - name (unicode): Name of package. + name (str): Name of package. RETURNS (bool): True if installed package, False if not. """ try: @@ -351,7 +351,7 @@ def is_package(name): def get_package_path(name): """Get the path to an installed package. - name (unicode): Package name. + name (str): Package name. RETURNS (Path): Path to installed package. """ name = name.lower() # use lowercase version to be safe @@ -526,8 +526,8 @@ def expand_exc(excs, search, replace): For example, to add additional versions with typographic apostrophes. excs (dict): Tokenizer exceptions. - search (unicode): String to find and replace. - replace (unicode): Replacement. + search (str): String to find and replace. + replace (str): Replacement. RETURNS (dict): Combined tokenizer exceptions. """ @@ -761,8 +761,8 @@ def from_disk(path, readers, exclude): def import_file(name, loc): """Import module from a file. Used to load models from a directory. - name (unicode): Name of module to load. - loc (unicode / Path): Path to the file. + name (str): Name of module to load. + loc (str / Path): Path to the file. RETURNS: The loaded module. """ loc = str(loc) @@ -777,8 +777,8 @@ def minify_html(html): Disclaimer: NOT a general-purpose solution, only removes indentation and newlines. - html (unicode): Markup to minify. - RETURNS (unicode): "Minified" HTML. + html (str): Markup to minify. + RETURNS (str): "Minified" HTML. """ return html.strip().replace(" ", "").replace("\n", "") @@ -787,8 +787,8 @@ def escape_html(text): """Replace <, >, &, " with their HTML encoded representation. Intended to prevent HTML errors in rendered displaCy markup. - text (unicode): The original text. - RETURNS (unicode): Equivalent text to be safely used within HTML. + text (str): The original text. + RETURNS (str): Equivalent text to be safely used within HTML. """ text = text.replace("&", "&") text = text.replace("<", "<") diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index e100ae915..0ed2462c6 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -57,7 +57,7 @@ cdef class Vectors: shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. - name (unicode): A name to identify the vectors table. + name (str): A name to identify the vectors table. RETURNS (Vectors): The newly created object. DOCS: https://spacy.io/api/vectors#init @@ -237,7 +237,7 @@ cdef class Vectors: def find(self, *, key=None, keys=None, row=None, rows=None): """Look up one or more keys by row, or vice versa. - key (unicode / int): Find the row that the given key points to. + key (str / int): Find the row that the given key points to. Returns int, -1 if missing. keys (iterable): Find rows that the keys point to. Returns ndarray. @@ -352,7 +352,7 @@ cdef class Vectors: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode / Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exists. DOCS: https://spacy.io/api/vectors#to_disk @@ -372,7 +372,7 @@ cdef class Vectors: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode / Path): Directory path, string or Path-like object. + path (str / Path): Directory path, string or Path-like object. 
RETURNS (Vectors): The modified object. DOCS: https://spacy.io/api/vectors#from_disk diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a1929559f..ed37f6e98 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -41,7 +41,7 @@ cdef class Vocab: strings (StringStore): StringStore that maps strings to integers, and vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. - name (unicode): Optional name to identify the vectors table. + name (str): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} @@ -97,7 +97,7 @@ cdef class Vocab: See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, `Token.check_flag`. - flag_getter (callable): A function `f(unicode) -> bool`, to get the + flag_getter (callable): A function `f(str) -> bool`, to get the flag value. flag_id (int): An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If -1, the lowest @@ -187,7 +187,7 @@ cdef class Vocab: def __contains__(self, key): """Check whether the string or int key has an entry in the vocabulary. - string (unicode): The ID string. + string (str): The ID string. RETURNS (bool) Whether the string has an entry in the vocabulary. DOCS: https://spacy.io/api/vocab#contains diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index feb167a9d..39148e476 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation. | Name | Type | Description | | -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `vocab` | `Vocab` | The lexeme's vocabulary. | -| `text` | unicode | Verbatim text content. | +| `text` | str | Verbatim text content. | | `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | +| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | | `rank` | int | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. | | `flags` | int | Container of the lexeme's binary flags. | | `norm` | int | The lexemes's norm, i.e. a normalized form of the lexeme text. | -| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. | +| `norm_` | str | The lexemes's norm, i.e. a normalized form of the lexeme text. | | `lower` | int | Lowercase form of the word. | -| `lower_` | unicode | Lowercase form of the word. | +| `lower_` | str | Lowercase form of the word. | | `shape` | int | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. 
| +| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | | `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | -| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. | +| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. | | `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the start of the word. Defaults to `N=3`. | | `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | | `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. | | `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | @@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation. | `is_oov` | bool | Is the lexeme out-of-vocabulary? | | `is_stop` | bool | Is the lexeme part of a "stop list"? | | `lang` | int | Language of the parent vocabulary. | -| `lang_` | unicode | Language of the parent vocabulary. | +| `lang_` | str | Language of the parent vocabulary. | | `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | | `cluster` | int | Brown cluster ID. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index e024ab54a..b851f6882 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -27,7 +27,7 @@ Create the vocabulary. | `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | | `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | -| `vectors_name` 2.2 | unicode | A name to identify the vectors table. | +| `vectors_name` 2.2 | str | A name to identify the vectors table. | | **RETURNS** | `Vocab` | The newly constructed object. | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -91,10 +91,10 @@ given string, you need to look it up in > assert oov not in nlp.vocab > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------- | -| `string` | unicode | The ID string. | -| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------- | +| `string` | str | The ID string. | +| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | ## Vocab.add_flag {#add_flag tag="method"} @@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`. | Name | Type | Description | | ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value. 
| +| `flag_getter` | dict | A function `f(str) -> bool`, to get the flag value. | | `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. | | **RETURNS** | int | The integer ID by which the flag value can be checked. | @@ -227,10 +227,10 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Vocab.from_disk {#from_disk tag="method" new="2"} @@ -243,11 +243,11 @@ Loads state from a directory. Modifies the object in place and returns it. > vocab = Vocab().from_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The modified `Vocab` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The modified `Vocab` object. | ## Vocab.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 5f47bd2e3..a84399312 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -157,19 +157,19 @@ The available token pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute | Type |  Description | -| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ | -| `ORTH` | unicode | The exact verbatim text of a token. | -| `TEXT` 2.1 | unicode | The exact verbatim text of a token. | -| `LOWER` | unicode | The lowercase form of the token text. | -|  `LENGTH` | int | The length of the token text. | -|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | -|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | -|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. 
| -|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | -|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | -| `ENT_TYPE` | unicode | The token's entity label. | -| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | +| Attribute | Type |  Description | +| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | +| `ORTH` | str | The exact verbatim text of a token. | +| `TEXT` 2.1 | str | The exact verbatim text of a token. | +| `LOWER` | str | The lowercase form of the token text. | +|  `LENGTH` | int | The length of the token text. | +|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | +|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | +|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | +|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | +|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | +| `ENT_TYPE` | str | The token's entity label. | +| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | @@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included! ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} -When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**, -the EntityRuler calls the nlp object to construct a doc object. This happens in case you try -to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to -extract matches based on the pattern's POS signature. +When using a large amount of **phrase patterns** (roughly > 10000) it's useful +to understand how the `add_patterns` function of the EntityRuler works. For each +**phrase pattern**, the EntityRuler calls the nlp object to construct a doc +object. This happens in case you try to add the EntityRuler at the end of an +existing pipeline with, for example, a POS tagger and want to extract matches +based on the pattern's POS signature. -In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler. +In this case you would pass a config value of `phrase_matcher_attr="POS"` for +the EntityRuler. -Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns. +Running the full language pipeline across every pattern in a large list scales +linearly and can therefore take a long time on large amounts of phrase patterns. -As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. +As of spaCy 2.2.4 the `add_patterns` function has been refactored to use +nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with +5,000-100,000 phrase patterns respectively. 
-Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time. +Even with this speedup (but especially if you're using an older version) the +`add_patterns` function can still take a long time. -An easy workaround to make this function run faster is disabling the other language pipes -while adding the phrase patterns. +An easy workaround to make this function run faster is disabling the other +language pipes while adding the phrase patterns. ```python entityruler = EntityRuler(nlp) From 262d306eaa5a8715ca5905c8fde341ba65771d09 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 17:23:00 +0200 Subject: [PATCH 19/39] unicode -> str consistency --- website/docs/api/cli.md | 8 +- website/docs/api/cython-classes.md | 2 +- website/docs/api/dependencyparser.md | 16 +- website/docs/api/doc.md | 50 +++--- website/docs/api/entitylinker.md | 18 +-- website/docs/api/entityrecognizer.md | 16 +- website/docs/api/entityruler.md | 27 ++-- website/docs/api/goldcorpus.md | 10 +- website/docs/api/goldparse.md | 5 +- website/docs/api/kb.md | 170 +++++++++++---------- website/docs/api/language.md | 79 +++++----- website/docs/api/lemmatizer.md | 24 +-- website/docs/api/lookups.md | 50 +++--- website/docs/api/matcher.md | 24 +-- website/docs/api/phrasematcher.md | 16 +- website/docs/api/pipeline-functions.md | 10 +- website/docs/api/sentencizer.md | 14 +- website/docs/api/span.md | 40 ++--- website/docs/api/stringstore.md | 46 +++--- website/docs/api/tagger.md | 26 ++-- website/docs/api/textcategorizer.md | 18 +-- website/docs/api/token.md | 160 +++++++++---------- website/docs/api/tokenizer.md | 88 +++++------ website/docs/api/top-level.md | 126 +++++++-------- website/docs/api/vectors.md | 20 +-- website/docs/usage/linguistic-features.md | 18 +-- website/docs/usage/processing-pipelines.md | 20 +-- website/docs/usage/saving-loading.md | 12 +- website/docs/usage/visualizers.md | 12 +- 29 files changed, 564 insertions(+), 561 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f067ba5a7..d507e13ec 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -504,10 +504,10 @@ tokenization can be provided. > srsly.write_jsonl("/path/to/text.jsonl", data) > ``` -| Key | Type | Description | -| -------- | ------- | ---------------------------------------------------------- | -| `text` | unicode | The raw input text. Is not required if `tokens` available. | -| `tokens` | list | Optional tokenization, one string per token. | +| Key | Type | Description | +| -------- | ---- | ---------------------------------------------------------- | +| `text` | str | The raw input text. Is not required if `tokens` available. | +| `tokens` | list | Optional tokenization, one string per token. | ```json ### Example diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 77d6fdd10..9dea04284 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -170,7 +170,7 @@ vocabulary. | Name | Type | Description | | ----------- | ---------------- | ------------------------------------------------------------------------------------------- | | `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | -| `string` | unicode | The string of the word to look up. | +| `string` | str | The string of the word to look up. | | **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. 
| ### Vocab.get_by_orth {#vocab_get_by_orth tag="method"} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index df0df3e38..0980dc2e0 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -229,9 +229,9 @@ Add a new label to the pipe. > parser.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. | ## DependencyParser.to_disk {#to_disk tag="method"} @@ -244,10 +244,10 @@ Serialize the pipe to disk. > parser.to_disk("/path/to/parser") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## DependencyParser.from_disk {#from_disk tag="method"} @@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index ab85c1deb..75491358d 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -123,7 +123,7 @@ details, see the documentation on | Name | Type | Description | | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -145,10 +145,10 @@ Look up a previously registered extension by name. 
Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | ## Doc.has_extension {#has_extension tag="classmethod" new="2"} @@ -162,10 +162,10 @@ Check whether an extension has been registered on the `Doc` class. > assert Doc.has_extension('has_city') > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. | ## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} @@ -180,10 +180,10 @@ Remove a previously registered extension. > assert not Doc.has_extension('has_city') > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | ## Doc.char_span {#char_span tag="method" new="2"} @@ -368,10 +368,10 @@ Save the current state to a directory. > doc.to_disk("/path/to/doc") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Doc.from_disk {#from_disk tag="method" new="2"} @@ -385,11 +385,11 @@ Loads state from a directory. Modifies the object in place and returns it. > doc = Doc(Vocab()).from_disk("/path/to/doc") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. 
| -| **RETURNS** | `Doc` | The modified `Doc` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The modified `Doc` object. | ## Doc.to_bytes {#to_bytes tag="method"} @@ -648,15 +648,15 @@ The L2 norm of the document's vector representation. | Name | Type | Description | | --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `text` | unicode | A unicode representation of the document text. | -| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | +| `text` | str | A unicode representation of the document text. | +| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | | `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `vocab` | `Vocab` | The store of lexical types. | | `tensor` 2 | `ndarray` | Container for dense vector representations. | | `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | | `user_data` | - | A generic storage area, for user custom data. | | `lang` 2.1 | int | Language of the document's vocabulary. | -| `lang_` 2.1 | unicode | Language of the document's vocabulary. | +| `lang_` 2.1 | str | Language of the document's vocabulary. | | `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | | `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | | `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index a9d6a31a5..d7f25ed56 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -258,10 +258,10 @@ Serialize the pipe to disk. > entity_linker.to_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityLinker.from_disk {#from_disk tag="method"} @@ -274,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it. > entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Type | Description | +| ----------- | -------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 9a2766c07..1d0c1de3a 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -230,9 +230,9 @@ Add a new label to the pipe. > ner.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. | ## EntityRecognizer.to_disk {#to_disk tag="method"} @@ -245,10 +245,10 @@ Serialize the pipe to disk. > ner.to_disk("/path/to/ner") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityRecognizer.from_disk {#from_disk tag="method"} @@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. 
| diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 0fd24897d..7bee3a77a 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -72,10 +72,10 @@ Whether a label is present in the patterns. > assert not "PERSON" in ruler > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------- | -| `label` | unicode | The label to check. | -| **RETURNS** | bool | Whether the entity ruler contains the label. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------- | +| `label` | str | The label to check. | +| **RETURNS** | bool | Whether the entity ruler contains the label. | ## EntityRuler.\_\_call\_\_ {#call tag="method"} @@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this happens automatically after the component has been added to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized with `overwrite_ents=True`, existing entities will be replaced if they overlap -with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer -patterns over shorter, and if equal the match occuring first in the Doc is chosen. +with the matches. When matches overlap in a Doc, the entity ruler prioritizes +longer patterns over shorter, and if equal the match occuring first in the Doc +is chosen. > #### Example > @@ -139,9 +140,9 @@ only the patterns are saved as JSONL. If a directory name is provided, a > ruler.to_disk("/path/to/entity_ruler") # saves patterns and config > ``` -| Name | Type | Description | -| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## EntityRuler.from_disk {#from_disk tag="method"} @@ -158,10 +159,10 @@ configuration. > ruler.from_disk("/path/to/entity_ruler") # loads patterns and config > ``` -| Name | Type | Description | -| ----------- | ---------------- | ---------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | +| Name | Type | Description | +| ----------- | ------------- | ---------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | ## EntityRuler.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/goldcorpus.md b/website/docs/api/goldcorpus.md index a18ef4d32..7767b28bd 100644 --- a/website/docs/api/goldcorpus.md +++ b/website/docs/api/goldcorpus.md @@ -17,8 +17,8 @@ Create a `GoldCorpus`. 
IF the input data is an iterable, each item should be a [`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx) for further details. -| Name | Type | Description | -| ----------- | --------------------------- | ------------------------------------------------------------ | -| `train` | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable. | -| `dev` | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. | -| **RETURNS** | `GoldCorpus` | The newly constructed object. | +| Name | Type | Description | +| ----------- | ----------------------- | ------------------------------------------------------------ | +| `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. | +| `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. | +| **RETURNS** | `GoldCorpus` | The newly constructed object. | diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 1ef6f0362..2f841eedd 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -62,7 +62,8 @@ Whether the provided syntactic annotations form a projective dependency tree. Convert a list of Doc objects into the [JSON-serializable format](/api/annotation#json-input) used by the -[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc. +[`spacy train`](/api/cli#train) command. Each input doc will be treated as a +'paragraph' in the output doc. > #### Example > @@ -160,7 +161,7 @@ single-token entity. | ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | | `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | -| **RETURNS** | list | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags. | +| **RETURNS** | list | str strings, describing the [BILUO](/api/annotation#biluo) tags. | ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index eeba85e84..f088815fd 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -1,16 +1,19 @@ --- title: KnowledgeBase -teaser: A storage class for entities and aliases of a specific knowledge base (ontology) +teaser: + A storage class for entities and aliases of a specific knowledge base + (ontology) tag: class source: spacy/kb.pyx new: 2.2 --- -The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init) -objects, which are plausible external identifiers given a certain textual mention. -Each such `Candidate` holds information from the relevant KB entities, -such as its frequency in text and possible aliases. -Each entity in the knowledge base also has a pretrained entity vector of a fixed size. +The `KnowledgeBase` object provides a method to generate +[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external +identifiers given a certain textual mention. Each such `Candidate` holds +information from the relevant KB entities, such as its frequency in text and +possible aliases. 
Each entity in the knowledge base also has a pretrained entity +vector of a fixed size. ## KnowledgeBase.\_\_init\_\_ {#init tag="method"} @@ -24,25 +27,25 @@ Create the knowledge base. > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) > ``` -| Name | Type | Description | -| ----------------------- | ---------------- | ----------------------------------------- | -| `vocab` | `Vocab` | A `Vocab` object. | -| `entity_vector_length` | int | Length of the fixed-size entity vectors. | -| **RETURNS** | `KnowledgeBase` | The newly constructed object. | - +| Name | Type | Description | +| ---------------------- | --------------- | ---------------------------------------- | +| `vocab` | `Vocab` | A `Vocab` object. | +| `entity_vector_length` | int | Length of the fixed-size entity vectors. | +| **RETURNS** | `KnowledgeBase` | The newly constructed object. | ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} The length of the fixed-size entity vectors in the knowledge base. -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------- | -| **RETURNS** | int | Length of the fixed-size entity vectors. | +| Name | Type | Description | +| ----------- | ---- | ---------------------------------------- | +| **RETURNS** | int | Length of the fixed-size entity vectors. | ## KnowledgeBase.add_entity {#add_entity tag="method"} -Add an entity to the knowledge base, specifying its corpus frequency -and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length). +Add an entity to the knowledge base, specifying its corpus frequency and entity +vector, which should be of length +[`entity_vector_length`](/api/kb#entity_vector_length). > #### Example > @@ -51,16 +54,16 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en > kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2) > ``` -| Name | Type | Description | -| --------------- | ------------- | ------------------------------------------------- | -| `entity` | unicode | The unique entity identifier | -| `freq` | float | The frequency of the entity in a typical corpus | -| `entity_vector` | vector | The pretrained vector of the entity | +| Name | Type | Description | +| --------------- | ------ | ----------------------------------------------- | +| `entity` | str | The unique entity identifier | +| `freq` | float | The frequency of the entity in a typical corpus | +| `entity_vector` | vector | The pretrained vector of the entity | ## KnowledgeBase.set_entities {#set_entities tag="method"} -Define the full list of entities in the knowledge base, specifying the corpus frequency -and entity vector for each entity. +Define the full list of entities in the knowledge base, specifying the corpus +frequency and entity vector for each entity. > #### Example > @@ -68,18 +71,19 @@ and entity vector for each entity. 
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2]) > ``` -| Name | Type | Description | -| ------------- | ------------- | ------------------------------------------------- | -| `entity_list` | iterable | List of unique entity identifiers | -| `freq_list` | iterable | List of entity frequencies | -| `vector_list` | iterable | List of entity vectors | +| Name | Type | Description | +| ------------- | -------- | --------------------------------- | +| `entity_list` | iterable | List of unique entity identifiers | +| `freq_list` | iterable | List of entity frequencies | +| `vector_list` | iterable | List of entity vectors | ## KnowledgeBase.add_alias {#add_alias tag="method"} -Add an alias or mention to the knowledge base, specifying its potential KB identifiers -and their prior probabilities. The entity identifiers should refer to entities previously -added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities). -The sum of the prior probabilities should not exceed 1. +Add an alias or mention to the knowledge base, specifying its potential KB +identifiers and their prior probabilities. The entity identifiers should refer +to entities previously added with [`add_entity`](/api/kb#add_entity) or +[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities +should not exceed 1. > #### Example > @@ -87,11 +91,11 @@ The sum of the prior probabilities should not exceed 1. > kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3]) > ``` -| Name | Type | Description | -| -------------- | ------------- | -------------------------------------------------- | -| `alias` | unicode | The textual mention or alias | -| `entities` | iterable | The potential entities that the alias may refer to | -| `probabilities`| iterable | The prior probabilities of each entity | +| Name | Type | Description | +| --------------- | -------- | -------------------------------------------------- | +| `alias` | str | The textual mention or alias | +| `entities` | iterable | The potential entities that the alias may refer to | +| `probabilities` | iterable | The prior probabilities of each entity | ## KnowledgeBase.\_\_len\_\_ {#len tag="method"} @@ -117,9 +121,9 @@ Get a list of all entity IDs in the knowledge base. > all_entities = kb.get_entity_strings() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | list | The list of entities in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------- | +| **RETURNS** | list | The list of entities in the knowledge base. | ## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"} @@ -131,9 +135,9 @@ Get the total number of aliases in the knowledge base. > total_aliases = kb.get_size_aliases() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | int | The number of aliases in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------- | +| **RETURNS** | int | The number of aliases in the knowledge base. | ## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"} @@ -145,9 +149,9 @@ Get a list of all aliases in the knowledge base. 
> all_aliases = kb.get_alias_strings() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | list | The list of aliases in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| **RETURNS** | list | The list of aliases in the knowledge base. | ## KnowledgeBase.get_candidates {#get_candidates tag="method"} @@ -160,10 +164,10 @@ of type [`Candidate`](/api/kb/#candidate_init). > candidates = kb.get_candidates("Douglas") > ``` -| Name | Type | Description | -| ------------- | ------------- | -------------------------------------------------- | -| `alias` | unicode | The textual mention or alias | -| **RETURNS** | iterable | The list of relevant `Candidate` objects | +| Name | Type | Description | +| ----------- | -------- | ---------------------------------------- | +| `alias` | str | The textual mention or alias | +| **RETURNS** | iterable | The list of relevant `Candidate` objects | ## KnowledgeBase.get_vector {#get_vector tag="method"} @@ -175,15 +179,15 @@ Given a certain entity ID, retrieve its pretrained entity vector. > vector = kb.get_vector("Q42") > ``` -| Name | Type | Description | -| ------------- | ------------- | -------------------------------------------------- | -| `entity` | unicode | The entity ID | -| **RETURNS** | vector | The entity vector | +| Name | Type | Description | +| ----------- | ------ | ----------------- | +| `entity` | str | The entity ID | +| **RETURNS** | vector | The entity vector | ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"} -Given a certain entity ID and a certain textual mention, retrieve -the prior probability of the fact that the mention links to the entity ID. +Given a certain entity ID and a certain textual mention, retrieve the prior +probability of the fact that the mention links to the entity ID. > #### Example > @@ -191,11 +195,11 @@ the prior probability of the fact that the mention links to the entity ID. > probability = kb.get_prior_prob("Q42", "Douglas") > ``` -| Name | Type | Description | -| ------------- | ------------- | --------------------------------------------------------------- | -| `entity` | unicode | The entity ID | -| `alias` | unicode | The textual mention or alias | -| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------------------------- | +| `entity` | str | The entity ID | +| `alias` | str | The textual mention or alias | +| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | ## KnowledgeBase.dump {#dump tag="method"} @@ -207,14 +211,14 @@ Save the current state of the knowledge base to a directory. > kb.dump(loc) > ``` -| Name | Type | Description | -| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ | -| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| ## KnowledgeBase.load_bulk {#load_bulk tag="method"} -Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab) -should also be the same as the one used to create the KB. +Restore the state of the knowledge base from a given directory. Note that the +[`Vocab`](/api/vocab) should also be the same as the one used to create the KB. > #### Example > @@ -226,18 +230,16 @@ should also be the same as the one used to create the KB. > kb.load_bulk("/path/to/kb") > ``` - -| Name | Type | Description | -| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | -| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | - +| Name | Type | Description | +| ----------- | --------------- | -------------------------------------------------------------------------- | +| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | ## Candidate.\_\_init\_\_ {#candidate_init tag="method"} Construct a `Candidate` object. Usually this constructor is not called directly, -but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method -of a `KnowledgeBase`. +but instead these objects are returned by the +[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`. > #### Example > @@ -257,12 +259,12 @@ of a `KnowledgeBase`. ## Candidate attributes {#candidate_attributes} -| Name | Type | Description | -| ---------------------- | ------------ | ------------------------------------------------------------------ | -| `entity` | int | The entity's unique KB identifier | -| `entity_` | unicode | The entity's unique KB identifier | -| `alias` | int | The alias or textual mention | -| `alias_` | unicode | The alias or textual mention | -| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | -| `entity_freq` | long | The frequency of the entity in a typical corpus | -| `entity_vector` | vector | The pretrained vector of the entity | +| Name | Type | Description | +| --------------- | ------ | -------------------------------------------------------------- | +| `entity` | int | The entity's unique KB identifier | +| `entity_` | str | The entity's unique KB identifier | +| `alias` | int | The alias or textual mention | +| `alias_` | str | The alias or textual mention | +| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | +| `entity_freq` | long | The frequency of the entity in a typical corpus | +| `entity_vector` | vector | The pretrained vector of the entity | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 703a0f678..496c89776 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -49,11 +49,11 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------------------- | -| `text` | unicode | The text to be processed. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Doc` | A container for accessing the annotations. 
| +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------------------- | +| `text` | str | The text to be processed. | +| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| **RETURNS** | `Doc` | A container for accessing the annotations. | @@ -201,7 +201,7 @@ Create a pipeline component from a factory. | Name | Type | Description | | ----------- | -------- | ---------------------------------------------------------------------------------- | -| `name` | unicode | Factory name to look up in [`Language.factories`](/api/language#class-attributes). | +| `name` | str | Factory name to look up in [`Language.factories`](/api/language#class-attributes). | | `config` | dict | Configuration parameters to initialize component. | | **RETURNS** | callable | The pipeline component. | @@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. Only one of `before`, `after`, | Name | Type | Description | | ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `component` | callable | The pipeline component. | -| `name` | unicode | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. | -| `before` | unicode | Component name to insert component directly before. | -| `after` | unicode | Component name to insert component directly after: | +| `name` | str | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. | +| `before` | str | Component name to insert component directly before. | +| `after` | str | Component name to insert component directly after: | | `first` | bool | Insert component first / not first in the pipeline. | | `last` | bool | Insert component last / not last in the pipeline. | @@ -243,10 +243,10 @@ Check whether a component is present in the pipeline. Equivalent to > assert nlp.has_pipe("component") > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------- | -| `name` | unicode | Name of the pipeline component to check. | -| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------- | +| `name` | str | Name of the pipeline component to check. | +| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | ## Language.get_pipe {#get_pipe tag="method" new="2"} @@ -261,7 +261,7 @@ Get a pipeline component for a given component name. | Name | Type | Description | | ----------- | -------- | -------------------------------------- | -| `name` | unicode | Name of the pipeline component to get. | +| `name` | str | Name of the pipeline component to get. | | **RETURNS** | callable | The pipeline component. | ## Language.replace_pipe {#replace_pipe tag="method" new="2"} @@ -276,7 +276,7 @@ Replace a component in the pipeline. 
| Name | Type | Description | | ----------- | -------- | --------------------------------- | -| `name` | unicode | Name of the component to replace. | +| `name` | str | Name of the component to replace. | | `component` | callable | The pipeline component to insert. | ## Language.rename_pipe {#rename_pipe tag="method" new="2"} @@ -292,10 +292,10 @@ added to the pipeline, you can also use the `name` argument on > nlp.rename_pipe("parser", "spacy_parser") > ``` -| Name | Type | Description | -| ---------- | ------- | -------------------------------- | -| `old_name` | unicode | Name of the component to rename. | -| `new_name` | unicode | New name of the component. | +| Name | Type | Description | +| ---------- | ---- | -------------------------------- | +| `old_name` | str | Name of the component to rename. | +| `new_name` | str | New name of the component. | ## Language.remove_pipe {#remove_pipe tag="method" new="2"} @@ -309,10 +309,10 @@ component function. > assert name == "parser" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `name` | unicode | Name of the component to remove. | -| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | +| Name | Type | Description | +| ----------- | ----- | ----------------------------------------------------- | +| `name` | str | Name of the component to remove. | +| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} @@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled. | Name | Type | Description | | ----------- | --------------- | ------------------------------------------------------------------------------------ | | `disable` | list | Names of pipeline components to disable. | -| `disable` | unicode | Name of pipeline component to disable. | +| `disable` | str | Name of pipeline component to disable. | | `enable` | list | Names of pipeline components that will not be disabled. | -| `enable` | unicode | Name of pipeline component that will not be disabled. | +| `enable` | str | Name of pipeline component that will not be disabled. | | **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | - As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: @@ -370,10 +369,10 @@ the model**. > nlp.to_disk("/path/to/models") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | ## Language.from_disk {#from_disk tag="method" new="2"} @@ -395,11 +394,11 @@ loaded object. 
> nlp = English().from_disk("/path/to/en_model") > ``` -| Name | Type | Description | -| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The modified `Language` object. | +| Name | Type | Description | +| ----------- | ------------ | ----------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The modified `Language` object. | @@ -480,11 +479,11 @@ per component. ## Class attributes {#class-attributes} -| Name | Type | Description | -| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | -| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | -| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | +| Name | Type | Description | +| -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | +| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | +| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index f43e17fd3..16cd624f5 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -63,8 +63,8 @@ Lemmatize a string. | Name | Type | Description | | ------------ | ------------- | -------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to lemmatize, e.g. the token text. | -| `univ_pos` | unicode / int | The token's universal part-of-speech tag. | +| `string` | str | The string to lemmatize, e.g. the token text. | +| `univ_pos` | str / int | The token's universal part-of-speech tag. | | `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. | | **RETURNS** | list | The available lemmas for the string. | @@ -82,11 +82,11 @@ original string is returned. Languages can provide a > assert lemmatizer.lookup("going") == "go" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to look up. 
| -| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | -| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- | +| `string` | str | The string to look up. | +| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | +| **RETURNS** | str | The lemma if the string was found, otherwise the original string. | ## Lemmatizer.is_base_form {#is_base_form tag="method"} @@ -102,11 +102,11 @@ lemmatization entirely. > assert is_base_form == True > ``` -| Name | Type | Description | -| ------------ | ------------- | --------------------------------------------------------------------------------------- | -| `univ_pos` | unicode / int | The token's universal part-of-speech tag. | -| `morphology` | dict | The token's morphological features. | -| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | +| Name | Type | Description | +| ------------ | --------- | --------------------------------------------------------------------------------------- | +| `univ_pos` | str / int | The token's universal part-of-speech tag. | +| `morphology` | dict | The token's morphological features. | +| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | ## Attributes {#attributes} diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md index bd3b38303..b91d92646 100644 --- a/website/docs/api/lookups.md +++ b/website/docs/api/lookups.md @@ -56,10 +56,10 @@ Check if the lookups contain a table of a given name. Delegates to > assert "some_table" in lookups > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------- | -| `name` | unicode | Name of the table. | -| **RETURNS** | bool | Whether a table of that name is in the lookups. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------- | +| `name` | str | Name of the table. | +| **RETURNS** | bool | Whether a table of that name is in the lookups. | ## Lookups.tables {#tables tag="property"} @@ -91,7 +91,7 @@ exists. | Name | Type | Description | | ----------- | ----------------------------- | ---------------------------------- | -| `name` | unicode | Unique name of the table. | +| `name` | str | Unique name of the table. | | `data` | dict | Optional data to add to the table. | | **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. | @@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist. | Name | Type | Description | | ----------- | ----------------------------- | ------------------ | -| `name` | unicode | Name of the table. | +| `name` | str | Name of the table. | | **RETURNS** | [`Table`](/api/lookups#table) | The table. | ## Lookups.remove_table {#remove_table tag="method"} @@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist. | Name | Type | Description | | ----------- | ----------------------------- | ---------------------------- | -| `name` | unicode | Name of the table to remove. | +| `name` | str | Name of the table to remove. | | **RETURNS** | [`Table`](/api/lookups#table) | The removed table. 
| ## Lookups.has_table {#has_table tag="method"} @@ -144,10 +144,10 @@ Check if the lookups contain a table of a given name. Equivalent to > assert lookups.has_table("some_table") > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------- | -| `name` | unicode | Name of the table. | -| **RETURNS** | bool | Whether a table of that name is in the lookups. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------- | +| `name` | str | Name of the table. | +| **RETURNS** | bool | Whether a table of that name is in the lookups. | ## Lookups.to_bytes {#to_bytes tag="method"} @@ -191,9 +191,9 @@ which will be created if it doesn't exist. > lookups.to_disk("/path/to/lookups") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Lookups.from_disk {#from_disk tag="method"} @@ -208,10 +208,10 @@ the file doesn't exist. > lookups.from_disk("/path/to/lookups") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Lookups` | The loaded lookups. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Lookups` | The loaded lookups. | ## Table {#table tag="class, ordererddict"} @@ -238,7 +238,7 @@ Initialize a new table. | Name | Type | Description | | ----------- | ------- | ---------------------------------- | -| `name` | unicode | Optional table name for reference. | +| `name` | str | Optional table name for reference. | | **RETURNS** | `Table` | The newly constructed object. | ### Table.from_dict {#table.from_dict tag="classmethod"} @@ -256,7 +256,7 @@ Initialize a new table from a dict. | Name | Type | Description | | ----------- | ------- | ---------------------------------- | | `data` | dict | The dictionary. | -| `name` | unicode | Optional table name for reference. | +| `name` | str | Optional table name for reference. | | **RETURNS** | `Table` | The newly constructed object. | ### Table.set {#table.set tag="method"} @@ -273,10 +273,10 @@ Set a new key / value pair. String keys will be hashed. Same as > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ------- | ------------- | ----------- | -| `key` | unicode / int | The key. | -| `value` | - | The value. | +| Name | Type | Description | +| ------- | --------- | ----------- | +| `key` | str / int | The key. | +| `value` | - | The value. | ### Table.to_bytes {#table.to_bytes tag="method"} @@ -313,6 +313,6 @@ Load a table from a bytestring. 
| Name | Type | Description | | -------------- | --------------------------- | ----------------------------------------------------- | -| `name` | unicode | Table name. | +| `name` | str | Table name. | | `default_size` | int | Default size of bloom filters if no data is provided. | | `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index bfd4fb0ec..8a872558c 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -125,10 +125,10 @@ Check whether the matcher contains rules for a match ID. > assert 'Rule' in matcher > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `key` | unicode | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------- | +| `key` | str | The match ID. | +| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | ## Matcher.add {#add tag="method" new="2"} @@ -153,7 +153,7 @@ overwritten. | Name | Type | Description | | ----------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | unicode | An ID for the thing you're matching. | +| `match_id` | str | An ID for the thing you're matching. | | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | @@ -188,9 +188,9 @@ exist. > assert "Rule" not in matcher > ``` -| Name | Type | Description | -| ----- | ------- | ------------------------- | -| `key` | unicode | The ID of the match rule. | +| Name | Type | Description | +| ----- | ---- | ------------------------- | +| `key` | str | The ID of the match rule. | ## Matcher.get {#get tag="method" new="2"} @@ -204,7 +204,7 @@ Retrieve the pattern stored for a key. Returns the rule as an > on_match, patterns = matcher.get("Rule") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------- | -| `key` | unicode | The ID of the match rule. | -| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------- | +| `key` | str | The ID of the match rule. | +| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index a72277420..fa6729f41 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -133,10 +133,10 @@ Check whether the matcher contains rules for a match ID. > assert "OBAMA" in matcher > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `key` | unicode | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------- | +| `key` | str | The match ID. | +| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | ## PhraseMatcher.add {#add tag="method"} @@ -162,7 +162,7 @@ overwritten. 
| Name | Type | Description | | ---------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | unicode | An ID for the thing you're matching. | +| `match_id` | str | An ID for the thing you're matching. | | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `*docs` | `Doc` | `Doc` objects of the phrases to match. | @@ -198,6 +198,6 @@ does not exist. > assert "OBAMA" not in matcher > ``` -| Name | Type | Description | -| ----- | ------- | ------------------------- | -| `key` | unicode | The ID of the match rule. | +| Name | Type | Description | +| ----- | ---- | ------------------------- | +| `key` | str | The ID of the match rule. | diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 6e2b473b1..fc417845c 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -112,8 +112,8 @@ end of the pipeline and after all other components. -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| `label` | unicode | The subtoken dependency label. Defaults to `"subtok"`. | -| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------ | +| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | +| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. | +| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. | diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index c9b935f22..03e843fcc 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -81,9 +81,9 @@ a file `sentencizer.json`. This also happens automatically when you save an > sentencizer.to_disk("/path/to/sentencizer.jsonl") > ``` -| Name | Type | Description | -| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Sentencizer.from_disk {#from_disk tag="method"} @@ -98,10 +98,10 @@ added to its pipeline. > sentencizer.from_disk("/path/to/sentencizer.json") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | +| Name | Type | Description | +| ----------- | ------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. 
| +| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | ## Sentencizer.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 3833bbca9..c41d9aa03 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -110,7 +110,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -132,10 +132,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | ## Span.has_extension {#has_extension tag="classmethod" new="2"} @@ -149,10 +149,10 @@ Check whether an extension has been registered on the `Span` class. > assert Span.has_extension("is_city") > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. | ## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} @@ -167,10 +167,10 @@ Remove a previously registered extension. > assert not Span.has_extension("is_city") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | ## Span.char_span {#char_span tag="method" new="2.2.4"} @@ -497,16 +497,16 @@ The L2 norm of the span's vector representation. | `end` | int | The token offset for the end of the span. | | `start_char` | int | The character offset for the start of the span. | | `end_char` | int | The character offset for the end of the span. 
| -| `text` | unicode | A unicode representation of the span text. | -| `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. | +| `text` | str | A unicode representation of the span text. | +| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. | | `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | +| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | | `label` | int | The hash value of the span's label. | -| `label_` | unicode | The span's label. | -| `lemma_` | unicode | The span's lemma. | +| `label_` | str | The span's label. | +| `lemma_` | str | The span's lemma. | | `kb_id` | int | The hash value of the knowledge base ID referred to by the span. | -| `kb_id_` | unicode | The knowledge base ID referred to by the span. | +| `kb_id_` | str | The knowledge base ID referred to by the span. | | `ent_id` | int | The hash value of the named entity the token is an instance of. | -| `ent_id_` | unicode | The string ID of the named entity the token is an instance of. | +| `ent_id_` | str | The string ID of the named entity the token is an instance of. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the span. | | `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index 268f19125..922174c78 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa. | Name | Type | Description | | -------------- | ------------------------ | -------------------------- | | `string_or_id` | bytes, unicode or uint64 | The value to encode. | -| **RETURNS** | unicode or int | The value to be retrieved. | +| **RETURNS** | str or int | The value to be retrieved. | ## StringStore.\_\_contains\_\_ {#contains tag="method"} @@ -69,10 +69,10 @@ Check whether a string is in the store. > assert not "cherry" in stringstore > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------- | -| `string` | unicode | The string to check. | -| **RETURNS** | bool | Whether the store contains the string. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------- | +| `string` | str | The string to check. | +| **RETURNS** | bool | Whether the store contains the string. | ## StringStore.\_\_iter\_\_ {#iter tag="method"} @@ -87,9 +87,9 @@ store will always include an empty string `''` at position `0`. > assert all_strings == ["apple", "orange"] > ``` -| Name | Type | Description | -| ---------- | ------- | ---------------------- | -| **YIELDS** | unicode | A string in the store. | +| Name | Type | Description | +| ---------- | ---- | ---------------------- | +| **YIELDS** | str | A string in the store. | ## StringStore.add {#add tag="method" new="2"} @@ -106,10 +106,10 @@ Add a string to the `StringStore`. > assert stringstore["banana"] == banana_hash > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------ | -| `string` | unicode | The string to add. | -| **RETURNS** | uint64 | The string's hash value. 
| +| Name | Type | Description | +| ----------- | ------ | ------------------------ | +| `string` | str | The string to add. | +| **RETURNS** | uint64 | The string's hash value. | ## StringStore.to_disk {#to_disk tag="method" new="2"} @@ -121,9 +121,9 @@ Save the current state to a directory. > stringstore.to_disk("/path/to/strings") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## StringStore.from_disk {#from_disk tag="method" new="2"} @@ -136,10 +136,10 @@ Loads state from a directory. Modifies the object in place and returns it. > stringstore = StringStore().from_disk("/path/to/strings") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `StringStore` | The modified `StringStore` object. | +| Name | Type | Description | +| ----------- | ------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `StringStore` | The modified `StringStore` object. | ## StringStore.to_bytes {#to_bytes tag="method"} @@ -185,7 +185,7 @@ Get a 64-bit hash for a given string. > assert hash_string("apple") == 8566208034543834098 > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------- | -| `string` | unicode | The string to hash. | -| **RETURNS** | uint64 | The hash. | +| Name | Type | Description | +| ----------- | ------ | ------------------- | +| `string` | str | The string to hash. | +| **RETURNS** | uint64 | The hash. | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index bd3382f89..f14da3ac5 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -229,10 +229,10 @@ Add a new label to the pipe. > tagger.add_label("MY_LABEL", {POS: 'NOUN'}) > ``` -| Name | Type | Description | -| -------- | ------- | --------------------------------------------------------------- | -| `label` | unicode | The label to add. | -| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. | +| Name | Type | Description | +| -------- | ---- | --------------------------------------------------------------- | +| `label` | str | The label to add. | +| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. | ## Tagger.to_disk {#to_disk tag="method"} @@ -245,10 +245,10 @@ Serialize the pipe to disk. > tagger.to_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. 
Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Tagger.from_disk {#from_disk tag="method"} @@ -261,11 +261,11 @@ Load the pipe from disk. Modifies the object in place and returns it. > tagger.from_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The modified `Tagger` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The modified `Tagger` object. | ## Tagger.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 1a0280265..dc1c083ac 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and | `vocab` | `Vocab` | The shared vocabulary. | | `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | | `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. | -| `architecture` | unicode | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. | +| `architecture` | str | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. | | **RETURNS** | `TextCategorizer` | The newly constructed object. | ### Architectures {#architectures new="2.1"} @@ -247,9 +247,9 @@ Add a new label to the pipe. > textcat.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. | ## TextCategorizer.to_disk {#to_disk tag="method"} @@ -262,10 +262,10 @@ Serialize the pipe to disk. > textcat.to_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## TextCategorizer.from_disk {#from_disk tag="method"} @@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ----------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | diff --git a/website/docs/api/token.md b/website/docs/api/token.md index c30c01c20..1accbe062 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -58,7 +58,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -80,10 +80,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | ## Token.has_extension {#has_extension tag="classmethod" new="2"} @@ -97,10 +97,10 @@ Check whether an extension has been registered on the `Token` class. > assert Token.has_extension("is_fruit") > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. 
|

## Token.remove_extension {#remove_extension tag="classmethod" new="2.0.11"}

Remove a previously registered extension.

> #### Example
>
> ```python
> from spacy.tokens import Token
> Token.set_extension("is_fruit", default=False)
> removed = Token.remove_extension("is_fruit")
> assert not Token.has_extension("is_fruit")
> ```

-| Name        | Type    | Description                                                            |
-| ----------- | ------- | ---------------------------------------------------------------------- |
-| `name`      | unicode | Name of the extension.                                                  |
-| **RETURNS** | tuple   | A `(default, method, getter, setter)` tuple of the removed extension.  |
+| Name        | Type  | Description                                                            |
+| ----------- | ----- | ---------------------------------------------------------------------- |
+| `name`      | str   | Name of the extension.                                                  |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension.  |

## Token.check_flag {#check_flag tag="method"}

@@ -408,71 +408,71 @@ The L2 norm of the token's vector representation.

## Attributes {#attributes}

-| Name                                         | Type         | Description                                                                                                                                                                   |
-| -------------------------------------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doc`                                        | `Doc`        | The parent document.                                                                                                                                                          |
-| `sent` 2.0.12                                | `Span`       | The sentence span that this token is a part of.                                                                                                                               |
-| `text`                                       | unicode      | Verbatim text content.                                                                                                                                                        |
-| `text_with_ws`                               | unicode      | Text content, with trailing space character if present.                                                                                                                      |
-| `whitespace_`                                | unicode      | Trailing space character if present.                                                                                                                                          |
-| `orth`                                       | int          | ID of the verbatim text content.                                                                                                                                              |
-| `orth_`                                      | unicode      | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes.                                                                   |
-| `vocab`                                      | `Vocab`      | The vocab object of the parent `Doc`.                                                                                                                                         |
-| `tensor` 2.1.7                               | `ndarray`    | The tokens's slice of the parent `Doc`'s tensor.                                                                                                                              |
-| `head`                                       | `Token`      | The syntactic parent, or "governor", of this token.                                                                                                                           |
-| `left_edge`                                  | `Token`      | The leftmost token of this token's syntactic descendants.                                                                                                                     |
-| `right_edge`                                 | `Token`      | The rightmost token of this token's syntactic descendants.                                                                                                                    |
-| `i`                                          | int          | The index of the token within the parent document.                                                                                                                            |
-| `ent_type`                                   | int          | Named entity type.                                                                                                                                                            |
-| `ent_type_`                                  | unicode      | Named entity type.                                                                                                                                                            |
-| `ent_iob`                                    | int          | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set.  |
-| `ent_iob_`                                   | unicode      | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set.   |
-| `ent_kb_id` 2.2                              | int          | Knowledge base ID that refers to the named entity this token is a part of, if any.                                                                                            |
-| `ent_kb_id_` 2.2                             | unicode      | Knowledge base ID that refers to the named entity this token is a part of, if any.                                                                                            |
-| `ent_id`                                     | int          | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution.                                                         |
-| `ent_id_`                                    | unicode      | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution.                                                         |
-| `lemma`                                      | int          | Base form of the token, with no inflectional suffixes.                                                                                                                        |
-| `lemma_`                                     | unicode      | Base form of the token, with no inflectional suffixes.                                                                                                                        |
-| `norm`                                       | int          | The token's norm, i.e. a normalized form of the token text. 
Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. | -| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | +| Name | Type | Description | +| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | +| `text` | str | Verbatim text content. | +| `text_with_ws` | str | Text content, with trailing space character if present. | +| `whitespace_` | str | Trailing space character if present. | +| `orth` | int | ID of the verbatim text content. | +| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | +| `head` | `Token` | The syntactic parent, or "governor", of this token. | +| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | +| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | +| `i` | int | The index of the token within the parent document. | +| `ent_type` | int | Named entity type. | +| `ent_type_` | str | Named entity type. | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `lemma` | int | Base form of the token, with no inflectional suffixes. | +| `lemma_` | str | Base form of the token, with no inflectional suffixes. | +| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `lower` | int | Lowercase form of the token. 
| +| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | | `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? | -| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Is the token out-of-vocabulary? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech. | -| `pos_` | unicode | Coarse-grained part-of-speech. | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | unicode | Fine-grained part-of-speech. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | unicode | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | unicode | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. 
| -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| `shape_` | str | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | +| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | +| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | +| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | +| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | +| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | +| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | +| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | +| `is_punct` | bool | Is the token punctuation? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | +| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | +| `is_bracket` | bool | Is the token a bracket? | +| `is_quote` | bool | Is the token a quotation mark? | +| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | +| `like_url` | bool | Does the token resemble a URL? | +| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the token resemble an email address? | +| `is_oov` | bool | Is the token out-of-vocabulary? | +| `is_stop` | bool | Is the token part of a "stop list"? | +| `pos` | int | Coarse-grained part-of-speech. | +| `pos_` | str | Coarse-grained part-of-speech. | +| `tag` | int | Fine-grained part-of-speech. | +| `tag_` | str | Fine-grained part-of-speech. | +| `dep` | int | Syntactic dependency relation. | +| `dep_` | str | Syntactic dependency relation. | +| `lang` | int | Language of the parent document's vocabulary. | +| `lang_` | str | Language of the parent document's vocabulary. | +| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | +| `idx` | int | The character offset of the token within the parent document. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | +| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `cluster` | int | Brown cluster ID. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
| diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..c71f849ad 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,15 +34,15 @@ the > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` -| Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | ----------- | ------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -55,10 +55,10 @@ Tokenize a string. > assert len(tokens) == 4 > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------- | -| `string` | unicode | The string to tokenize. | -| **RETURNS** | `Doc` | A container for linguistic annotations. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------- | +| `string` | str | The string to tokenize. | +| **RETURNS** | `Doc` | A container for linguistic annotations. | ## Tokenizer.pipe {#pipe tag="method"} @@ -82,20 +82,20 @@ Tokenize a stream of texts. Find internal split points of the string. -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to split. | -| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `string` | str | The string to split. | +| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. 
| ## Tokenizer.find_prefix {#find_prefix tag="method"} Find the length of a prefix that should be segmented from the string, or `None` if no prefix rules match. -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------ | -| `string` | unicode | The string to segment. | -| **RETURNS** | int | The length of the prefix if present, otherwise `None`. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------------ | +| `string` | str | The string to segment. | +| **RETURNS** | int | The length of the prefix if present, otherwise `None`. | ## Tokenizer.find_suffix {#find_suffix tag="method"} @@ -104,7 +104,7 @@ if no suffix rules match. | Name | Type | Description | | ----------- | ------------ | ------------------------------------------------------ | -| `string` | unicode | The string to segment. | +| `string` | str | The string to segment. | | **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. | ## Tokenizer.add_special_case {#add_special_case tag="method"} @@ -125,7 +125,7 @@ and examples. | Name | Type | Description | | ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `string` | unicode | The string to specially tokenize. | +| `string` | str | The string to specially tokenize. | | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | ## Tokenizer.explain {#explain tag="method"} @@ -142,10 +142,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens. > assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"] > ``` -| Name | Type | Description | -| ------------| -------- | --------------------------------------------------- | -| `string` | unicode | The string to tokenize with the debugging tokenizer | -| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------------- | +| `string` | str | The string to tokenize with the debugging tokenizer | +| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | ## Tokenizer.to_disk {#to_disk tag="method"} @@ -158,10 +158,10 @@ Serialize the tokenizer to disk. > tokenizer.to_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Tokenizer.from_disk {#from_disk tag="method"} @@ -174,11 +174,11 @@ Load the tokenizer from disk. 
Modifies the object in place and returns it. > tokenizer.from_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | ## Tokenizer.to_bytes {#to_bytes tag="method"} @@ -217,14 +217,14 @@ it. ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | -| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | -| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an `re.MatchObject` or `None. | -| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | +| Name | Type | Description | +| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | +| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | +| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | +| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an`re.MatchObject`or`None. | +| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 2360ad472..bdd094021 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -32,11 +32,11 @@ class. The data will be loaded in via > nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------------------------- | -| `name` | unicode / `Path` | Model to load, i.e. shortcut link, package name or path. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). 
| -| **RETURNS** | `Language` | A `Language` object with the loaded model. | +| Name | Type | Description | +| ----------- | ------------ | --------------------------------------------------------------------------------- | +| `name` | str / `Path` | Model to load, i.e. shortcut link, package name or path. | +| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| **RETURNS** | `Language` | A `Language` object with the loaded model. | Essentially, `spacy.load()` is a convenience wrapper that reads the language ID and pipeline components from a model's `meta.json`, initializes the `Language` @@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of | Name | Type | Description | | ----------- | ---------- | ------------------------------------------------------------------------------------------------ | -| `name` | unicode | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | +| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | | `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. | @@ -98,10 +98,10 @@ meta data as a dictionary instead, you can use the `meta` attribute on your > spacy.info("de", markdown=True) > ``` -| Name | Type | Description | -| ---------- | ------- | ------------------------------------------------------------- | -| `model` | unicode | A model, i.e. shortcut link, package name or path (optional). | -| `markdown` | bool | Print information as Markdown. | +| Name | Type | Description | +| ---------- | ---- | ------------------------------------------------------------- | +| `model` | str | A model, i.e. shortcut link, package name or path (optional). | +| `markdown` | bool | Print information as Markdown. | ### spacy.explain {#spacy.explain tag="function"} @@ -122,10 +122,10 @@ list of available terms, see > # world NN noun, singular or mass > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------- | -| `term` | unicode | Term to explain. | -| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------- | +| `term` | str | Term to explain. | +| **RETURNS** | str | The explanation, or `None` if not found in the glossary. | ### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"} @@ -189,13 +189,13 @@ browser. Will run a simple web server. | Name | Type | Description | Default | | --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- | | `docs` | list, `Doc`, `Span` | Document(s) to visualize. | -| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | +| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `page` | bool | Render markup as full HTML page. | `True` | | `minify` | bool | Minify HTML markup. | `False` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. 
| `False` | | `port` | int | Port to serve visualization. | `5000` | -| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` | +| `host` | str | Host to serve visualization. | `'0.0.0.0'` | ### displacy.render {#displacy.render tag="method" new="2"} @@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization. | Name | Type | Description | Default | | ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | `docs` | list, `Doc`, `Span` | Document(s) to visualize. | -| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | +| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `page` | bool | Render markup as full HTML page. | `False` | | `minify` | bool | Minify HTML markup. | `False` | | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | -| **RETURNS** | unicode | Rendered HTML markup. | +| **RETURNS** | str | Rendered HTML markup. | ### Visualizer options {#displacy_options} @@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | unicode | Font name or font family for all text. | `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Type | Description | Default | +| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). 
| `False` | +| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | +| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | +| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` | +| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` | +| `font` | str | Font name or font family for all text. | `'Arial'` | +| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | +| `arrow_stroke` | int | Width of arrow path in px. | `2` | +| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | +| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | +| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | +| `distance` | int | Distance between words in px. | `175` / `150` (compact) | #### Named Entity Visualizer options {#displacy_options-ent} @@ -263,11 +263,11 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Type | Description | Default | -| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | -| `ents` | list | Entity types to highlight (`None` for all types). | `None` | -| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | -| `template` 2.2 | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | +| Name | Type | Description | Default | +| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | +| `ents` | list | Entity types to highlight (`None` for all types). | `None` | +| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | +| `template` 2.2 | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | By default, displaCy comes with colors for all [entity types supported by spaCy](/api/annotation#named-entities). If you're @@ -308,9 +308,9 @@ Set custom path to the data directory where spaCy looks for models. > # PosixPath('/custom/path') > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------- | -| `path` | unicode / `Path` | Path to new data directory. | +| Name | Type | Description | +| ------ | ------------ | --------------------------- | +| `path` | str / `Path` | Path to new data directory. 
| ### util.get_lang_class {#util.get_lang_class tag="function"} @@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper. | Name | Type | Description | | ----------- | ---------- | -------------------------------------- | -| `lang` | unicode | Two-letter language code, e.g. `'en'`. | +| `lang` | str | Two-letter language code, e.g. `'en'`. | | **RETURNS** | `Language` | Language class. | ### util.set_lang_class {#util.set_lang_class tag="function"} @@ -352,7 +352,7 @@ the two-letter language code. | Name | Type | Description | | ------ | ---------- | -------------------------------------- | -| `name` | unicode | Two-letter language code, e.g. `'en'`. | +| `name` | str | Two-letter language code, e.g. `'en'`. | | `cls` | `Language` | The language class, e.g. `English`. | ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} @@ -368,10 +368,10 @@ loaded lazily, to avoid expensive setup code associated with the language data. > assert util.lang_class_is_loaded("de") is False > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------- | -| `name` | unicode | Two-letter language code, e.g. `'en'`. | -| **RETURNS** | bool | Whether the class has been loaded. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------- | +| `name` | str | Two-letter language code, e.g. `'en'`. | +| **RETURNS** | bool | Whether the class has been loaded. | ### util.load_model {#util.load_model tag="function" new="2"} @@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk). | Name | Type | Description | | ------------- | ---------- | -------------------------------------------------------- | -| `name` | unicode | Package name, shortcut link or model path. | +| `name` | str | Package name, shortcut link or model path. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet. | Name | Type | Description | | ------------- | ---------- | ---------------------------------------------------------------------------------------------------- | -| `model_path` | unicode | Path to model data directory. | +| `model_path` | str | Path to model data directory. | | `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's | Name | Type | Description | | ------------- | ---------- | -------------------------------------------------------- | -| `init_file` | unicode | Path to model's `__init__.py`, i.e. `__file__`. | +| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -446,10 +446,10 @@ Get a model's meta.json from a directory path and validate its contents. > meta = util.get_model_meta("/path/to/model") > ``` -| Name | Type | Description | -| ----------- | ---------------- | ------------------------ | -| `path` | unicode / `Path` | Path to model directory. | -| **RETURNS** | dict | The model's meta data. 
| +| Name | Type | Description | +| ----------- | ------------ | ------------------------ | +| `path` | str / `Path` | Path to model directory. | +| **RETURNS** | dict | The model's meta data. | ### util.is_package {#util.is_package tag="function"} @@ -463,10 +463,10 @@ Check if string maps to a package installed via pip. Mainly used to validate > util.is_package("xyz") # False > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------- | -| `name` | unicode | Name of package. | -| **RETURNS** | `bool` | `True` if installed package, `False` if not. | +| Name | Type | Description | +| ----------- | ------ | -------------------------------------------- | +| `name` | str | Name of package. | +| **RETURNS** | `bool` | `True` if installed package, `False` if not. | ### util.get_package_path {#util.get_package_path tag="function" new="2"} @@ -480,10 +480,10 @@ Get path to an installed package. Mainly used to resolve the location of > # /usr/lib/python3.6/site-packages/en_core_web_sm > ``` -| Name | Type | Description | -| -------------- | ------- | -------------------------------- | -| `package_name` | unicode | Name of installed package. | -| **RETURNS** | `Path` | Path to model package directory. | +| Name | Type | Description | +| -------------- | ------ | -------------------------------- | +| `package_name` | str | Name of installed package. | +| **RETURNS** | `Path` | Path to model package directory. | ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"} diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 93e747c1e..d4c0269ef 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -35,7 +35,7 @@ you can add vectors to later. | `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `keys` | iterable | A sequence of keys aligned with the data. | | `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. | -| `name` | unicode | A name to identify the vectors table. | +| `name` | str | A name to identify the vectors table. | | **RETURNS** | `Vectors` | The newly created object. | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} @@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the | Name | Type | Description | | ----------- | ---------------------------------- | ----------------------------------------------------- | -| `key` | unicode / int | The key to add. | +| `key` | str / int | The key to add. | | `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. | | `row` | int | An optional row number of a vector to map the key to. | | **RETURNS** | int | The row the vector was added to. | @@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa. | Name | Type | Description | | ----------- | ------------------------------------- | ------------------------------------------------------------------------ | -| `key` | unicode / int | Find the row that the given key points to. Returns int, `-1` if missing. | +| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. | | `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. | | `row` | int | Find the first key that points to the row. Returns int. | | `rows` | iterable | Find the keys that point to the rows. Returns ndarray. 
| @@ -337,9 +337,9 @@ Save the current state to a directory. > > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Vectors.from_disk {#from_disk tag="method"} @@ -352,10 +352,10 @@ Loads state from a directory. Modifies the object in place and returns it. > vectors.from_disk("/path/to/vectors") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Vectors` | The modified `Vectors` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Vectors` | The modified `Vectors` object. | ## Vectors.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 685619c88..420e8263a 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy).. ### Disabling the parser {#disabling} In the [default models](/models), the parser is loaded and enabled as part of -the [standard processing pipeline](/usage/processing-pipelines). If you don't need -any of the syntactic information, you should disable the parser. Disabling the -parser will make spaCy load and run much faster. If you want to load the parser, -but need to disable it for specific documents, you can also control its use on -the `nlp` object. +the [standard processing pipeline](/usage/processing-pipelines). If you don't +need any of the syntactic information, you should disable the parser. Disabling +the parser will make spaCy load and run much faster. If you want to load the +parser, but need to disable it for specific documents, you can also control its +use on the `nlp` object. ```python nlp = spacy.load("en_core_web_sm", disable=["parser"]) @@ -990,10 +990,10 @@ nlp = spacy.load("en_core_web_sm") nlp.tokenizer = my_tokenizer ``` -| Argument | Type | Description | -| ----------- | ------- | ------------------------- | -| `text` | unicode | The raw text to tokenize. | -| **RETURNS** | `Doc` | The tokenized document. | +| Argument | Type | Description | +| ----------- | ----- | ------------------------- | +| `text` | str | The raw text to tokenize. | +| **RETURNS** | `Doc` | The tokenized document. 
| diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 696e11106..e7aca3981 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -272,16 +272,16 @@ doc = nlp("I won't have named entities") disabled.restore() ``` -If you want to disable all pipes except for one or a few, you can use the `enable` -keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string -defining just one pipe. +If you want to disable all pipes except for one or a few, you can use the +`enable` keyword. Just like the `disable` keyword, it takes a list of pipe +names, or a string defining just one pipe. + ```python # Enable only the parser with nlp.select_pipes(enable="parser"): doc = nlp("I will only be parsed") ``` - Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method to remove pipeline components from an existing pipeline, the [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the @@ -349,12 +349,12 @@ last** in the pipeline, or define a **custom name**. If no name is set and no > nlp.add_pipe(my_component, before="parser") > ``` -| Argument | Type | Description | -| -------- | ------- | ------------------------------------------------------------------------ | -| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | -| `first` | bool | If set to `True`, component is added **first** in the pipeline. | -| `before` | unicode | String name of component to add the new component **before**. | -| `after` | unicode | String name of component to add the new component **after**. | +| Argument | Type | Description | +| -------- | ---- | ------------------------------------------------------------------------ | +| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | +| `first` | bool | If set to `True`, component is added **first** in the pipeline. | +| `before` | str | String name of component to add the new component **before**. | +| `after` | str | String name of component to add the new component **after**. | ### Example: A simple pipeline component {#custom-components-simple} diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 058204a5d..588782986 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab)) If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as well, which includes the values of -[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if -they're serializable with msgpack). +[extension attributes](/usage/processing-pipelines#custom-components-attributes) +(if they're serializable with msgpack). @@ -666,10 +666,10 @@ and lets you customize how the model should be initialized and loaded. You can define the language data to be loaded and the [processing pipeline](/usage/processing-pipelines) to execute. -| Setting | Type | Description | -| ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | unicode | ID of the language class to initialize. | -| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. 
| +| Setting | Type | Description | +| ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | str | ID of the language class to initialize. | +| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. | The `load()` method that comes with our model package templates will take care of putting all this together and returning a `Language` object with the loaded diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index dd0b0eb50..9733e09c2 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -67,12 +67,12 @@ arcs. -| Argument | Type | Description | Default | -| --------- | ------- | ----------------------------------------------------------- | ----------- | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `"#000000"` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `"#ffffff"` | -| `font` | unicode | Font name or font family for all text. | `"Arial"` | +| Argument | Type | Description | Default | +| --------- | ---- | ----------------------------------------------------------- | ----------- | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` | +| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` | +| `font` | str | Font name or font family for all text. | `"Arial"` | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options). From 1a15896ba9bcb2b12113880929edfb4fdf0683ff Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 18:51:10 +0200 Subject: [PATCH 20/39] unicode -> str consistency [ci skip] --- spacy/cli/info.py | 2 +- spacy/displacy/render.py | 2 +- spacy/gold.pyx | 4 ++-- spacy/language.py | 4 ++-- spacy/matcher/dependencymatcher.pyx | 2 +- spacy/matcher/matcher.pyx | 2 +- spacy/matcher/phrasematcher.pyx | 2 +- spacy/pipeline/entityruler.py | 2 +- spacy/strings.pyx | 6 +++--- spacy/tokenizer.pyx | 4 ++-- spacy/tokens/doc.pyx | 4 ++-- spacy/util.py | 8 ++++---- spacy/vocab.pyx | 10 +++++----- 13 files changed, 26 insertions(+), 26 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index d779eb2b3..98fd5cabf 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -65,7 +65,7 @@ def print_markdown(data, title=None): """Print data in GitHub-flavoured Markdown format for issues etc. data (dict or list of tuples): Label/value pairs. - title (unicode or None): Title, will be rendered as headline 2. + title (str / None): Title, will be rendered as headline 2. """ markdown = [] for key, value in data.items(): diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index d3572ce78..ef8632cbc 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -302,7 +302,7 @@ class EntityRenderer(object): text (str): Original text. spans (list): Individual entity spans and their start, end and label. - title (unicode or None): Document title set in Doc.user_data['title']. + title (str / None): Document title set in Doc.user_data['title']. 
""" markup = "" offset = 0 diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1864b7a04..ecbd13354 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -140,8 +140,8 @@ class GoldCorpus(object): def __init__(self, train, dev, gold_preproc=False, limit=None): """Create a GoldCorpus. - train (unicode or Path): File or directory of training data. - dev (unicode or Path): File or directory of development data. + train (str / Path): File or directory of training data. + dev (str / Path): File or directory of development data. RETURNS (GoldCorpus): The newly created object. """ self.limit = limit diff --git a/spacy/language.py b/spacy/language.py index e3b770723..551b8c9af 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -934,7 +934,7 @@ class Language(object): """Save the current state to a directory. If a model is loaded, this will include the model. - path (unicode or Path): Path to a directory, which will be created if + path (str / Path): Path to a directory, which will be created if it doesn't exist. exclude (list): Names of components or serialization fields to exclude. @@ -968,7 +968,7 @@ class Language(object): returns it. If the saved `Language` object contains a model, the model will be loaded. - path (unicode or Path): A path to a directory. + path (str / Path): A path to a directory. exclude (list): Names of components or serialization fields to exclude. RETURNS (Language): The modified `Language` object. diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 732931380..ddeeedd06 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -194,7 +194,7 @@ cdef class DependencyMatcher: def get(self, key, default=None): """Retrieve the pattern stored for a key. - key (unicode or int): The key to retrieve. + key (str / int): The key to retrieve. RETURNS (tuple): The rule, as an (on_match, patterns) tuple. """ key = self._normalize_key(key) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 225eba9a9..868465b8d 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -165,7 +165,7 @@ cdef class Matcher: def get(self, key, default=None): """Retrieve the pattern stored for a key. - key (unicode or int): The key to retrieve. + key (str / int): The key to retrieve. RETURNS (tuple): The rule, as an (on_match, patterns) tuple. """ key = self._normalize_key(key) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index f7ce44ece..aa4534296 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -30,7 +30,7 @@ cdef class PhraseMatcher: """Initialize the PhraseMatcher. vocab (Vocab): The shared vocabulary. - attr (int / unicode): Token attribute to match on. + attr (int / str): Token attribute to match on. validate (bool): Perform additional validation when patterns are added. RETURNS (PhraseMatcher): The newly constructed object. diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index cdacc82f6..bdc009192 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -30,7 +30,7 @@ class EntityRuler(object): nlp (Language): The shared nlp object to pass the vocab to the matchers and process phrase patterns. 
- phrase_matcher_attr (int / unicode): Token attribute to match on, passed + phrase_matcher_attr (int / str): Token attribute to match on, passed to the internal PhraseMatcher as `attr` validate (bool): Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate` diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 9fe5af154..9e584ce8a 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -109,7 +109,7 @@ cdef class StringStore: """Retrieve a string from a given hash, or vice versa. string_or_id (bytes, unicode or uint64): The value to encode. - Returns (unicode or uint64): The value to be retrieved. + Returns (str / uint64): The value to be retrieved. """ if isinstance(string_or_id, basestring) and len(string_or_id) == 0: return 0 @@ -223,7 +223,7 @@ cdef class StringStore: def to_disk(self, path): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. """ path = util.ensure_path(path) @@ -234,7 +234,7 @@ cdef class StringStore: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. Paths may be either + path (str / Path): A path to a directory. Paths may be either strings or `Path`-like objects. RETURNS (StringStore): The modified `StringStore` object. """ diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b628b1171..538bf60e9 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -693,7 +693,7 @@ cdef class Tokenizer: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. exclude (list): String names of serialization fields to exclude. @@ -707,7 +707,7 @@ cdef class Tokenizer: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. + path (str / Path): A path to a directory. exclude (list): String names of serialization fields to exclude. RETURNS (Tokenizer): The modified `Tokenizer` object. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f6d0dbf4a..31c1e8c82 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -843,7 +843,7 @@ cdef class Doc: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. exclude (list): String names of serialization fields to exclude. @@ -857,7 +857,7 @@ cdef class Doc: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. Paths may be either + path (str / Path): A path to a directory. Paths may be either strings or `Path`-like objects. exclude (list): String names of serialization fields to exclude. RETURNS (Doc): The modified `Doc` object. diff --git a/spacy/util.py b/spacy/util.py index fc5837755..b614c29c7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -269,7 +269,7 @@ def load_config(path, create_objects=False): """Load a Thinc-formatted config file, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. 
- path (unicode or Path): Path to the config file + path (str / Path): Path to the config file create_objects (bool): Whether to automatically create objects when the config references registry entries. Defaults to False. @@ -286,7 +286,7 @@ def load_config_from_str(string, create_objects=False): """Load a Thinc-formatted config, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. - string (unicode or Path): Text contents of the config file. + string (str / Path): Text contents of the config file. create_objects (bool): Whether to automatically create objects when the config references registry entries. Defaults to False. @@ -302,7 +302,7 @@ def load_config_from_str(string, create_objects=False): def get_model_meta(path): """Get model meta.json from a directory path and validate its contents. - path (unicode or Path): Path to model directory. + path (str / Path): Path to model directory. RETURNS (dict): The model's meta data. """ model_path = ensure_path(path) @@ -321,7 +321,7 @@ def get_model_meta(path): def get_model_config(path): """Get the model's config from a directory path. - path (unicode or Path): Path to model directory. + path (str / Path): Path to model directory. RETURNS (Config): The model's config data. """ model_path = ensure_path(path) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ed37f6e98..3a82ab72d 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -336,7 +336,7 @@ cdef class Vocab: If `minn` is defined, then the resulting vector uses Fasttext's subword features by average over ngrams of `orth`. - orth (int / unicode): The hash value of a word, or its unicode string. + orth (int / str): The hash value of a word, or its unicode string. minn (int): Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. maxn (int): Maximum n-gram length used for Fasttext's ngram computation. @@ -389,7 +389,7 @@ cdef class Vocab: """Set a vector for a word in the vocabulary. Words can be referenced by string or int ID. - orth (int / unicode): The word. + orth (int / str): The word. vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set. DOCS: https://spacy.io/api/vocab#set_vector @@ -411,7 +411,7 @@ cdef class Vocab: """Check whether a word has a vector. Returns False if no vectors have been loaded. Words can be looked up by string or int ID. - orth (int / unicode): The word. + orth (int / str): The word. RETURNS (bool): Whether the word has a vector. DOCS: https://spacy.io/api/vocab#has_vector @@ -423,7 +423,7 @@ cdef class Vocab: def to_disk(self, path, exclude=tuple(), **kwargs): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. exclude (list): String names of serialization fields to exclude. @@ -448,7 +448,7 @@ cdef class Vocab: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. + path (str / Path): A path to a directory. exclude (list): String names of serialization fields to exclude. RETURNS (Vocab): The modified `Vocab` object. 
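The "str / Path" wording adopted in the docstrings above works because these methods normalize their argument with `util.ensure_path`, which the patched `to_disk`/`from_disk` implementations already call. A minimal sketch of that behaviour (the `/tmp/vocab` path is purely illustrative and not part of the patch):

```python
# Sketch only: str and pathlib.Path are interchangeable for spaCy's disk I/O,
# because util.ensure_path converts strings to Path objects and passes
# Path-like objects through unchanged.
from pathlib import Path
from spacy.util import ensure_path

assert ensure_path("/tmp/vocab") == Path("/tmp/vocab")        # str is converted
assert ensure_path(Path("/tmp/vocab")) == Path("/tmp/vocab")  # Path passes through
```
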
From 4fd087572a1c597781fef8ca4fbcfebed825c0fb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 28 May 2020 12:51:37 +0200 Subject: [PATCH 21/39] WIP: improve model version deps --- spacy/cli/package.py | 2 +- spacy/util.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index cf93c872f..15ae2033c 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -138,7 +138,7 @@ def list_files(data_dir): def list_requirements(meta): parent_package = meta.get('parent_package', 'spacy') - requirements = [parent_package + meta['spacy_version']] + requirements = [parent_package + '>=' + meta['spacy_version']] if 'setup_requires' in meta: requirements += meta['setup_requires'] if 'requirements' in meta: diff --git a/spacy/util.py b/spacy/util.py index b614c29c7..4e468ef9d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -265,6 +265,15 @@ def is_compatible_model(meta): return True +def get_model_version_range(version): + """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy + version. Models are always compatible across patch versions but not + across minor or major versions. + """ + major, minor = split_version(version) + return f">={version},<{major}.{minor + 1}.0" + + def load_config(path, create_objects=False): """Load a Thinc-formatted config file, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. From bed62991add4ff12282a00dd1d321441878b27ef Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 14:59:55 +0200 Subject: [PATCH 22/39] Tidy up requirements --- requirements.txt | 5 ++++- setup.cfg | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index add083a05..a104b68ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,8 +13,11 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 -importlib_metadata>=0.20; python_version < "3.8" pydantic>=1.3.0,<2.0.0 +# Official Python utilities +setuptools +packaging +importlib_metadata>=0.20; python_version < "3.8" # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index eb7608c4e..ae09d071c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,15 +47,16 @@ install_requires = wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 - ml_datasets + ml_datasets>=0.1.1 # Third-party dependencies tqdm>=4.38.0,<5.0.0 - setuptools numpy>=1.15.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 - tqdm>=4.38.0,<5.0.0 + # Official Python utilities + setuptools + packaging importlib_metadata>=0.20; python_version < "3.8" [options.extras_require] From e47e5a4b10e0d3c5b6fed255040cebc019173e39 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:01:58 +0200 Subject: [PATCH 23/39] Use more sophisticated version parsing logic --- spacy/cli/download.py | 7 +++--- spacy/cli/package.py | 4 ++-- spacy/cli/validate.py | 9 ++++---- spacy/language.py | 3 ++- spacy/tests/test_misc.py | 12 ++++++++-- spacy/util.py | 49 ++++++++++++++++------------------------ 6 files changed, 41 insertions(+), 43 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index af132bbbe..3d56822a5 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -5,7 +5,7 @@ import sys from wasabi import msg from .. 
import about -from ..util import is_package +from ..util import is_package, get_base_version def download( @@ -63,8 +63,7 @@ def get_json(url, desc): def get_compatibility(): - version = about.__version__ - version = version.rsplit(".dev", 1)[0] + version = get_base_version(about.__version__) comp_table = get_json(about.__compatibility__, "compatibility table") comp = comp_table["spacy"] if version not in comp: @@ -73,7 +72,7 @@ def get_compatibility(): def get_version(model, comp): - model = model.rsplit(".dev", 1)[0] + model = get_base_version(model) if model not in comp: msg.fail( f"No compatible model found for '{model}' (spaCy v{about.__version__})", diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 15ae2033c..153e61ba3 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -90,7 +90,7 @@ def generate_meta(model_path, existing_meta, msg): ("license", "License", meta.get("license", "MIT")), ] nlp = util.load_model_from_path(Path(model_path)) - meta["spacy_version"] = about.__version__ + meta["spacy_version"] = util.get_model_version_range(about.__version__) meta["pipeline"] = nlp.pipe_names meta["vectors"] = { "width": nlp.vocab.vectors_length, @@ -138,7 +138,7 @@ def list_files(data_dir): def list_requirements(meta): parent_package = meta.get('parent_package', 'spacy') - requirements = [parent_package + '>=' + meta['spacy_version']] + requirements = [parent_package + meta['spacy_version']] if 'setup_requires' in meta: requirements += meta['setup_requires'] if 'requirements' in meta: diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index c39cadc7b..3c49abb3e 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -4,7 +4,7 @@ import requests from wasabi import msg from .. import about -from ..util import get_package_version, get_installed_models, split_version +from ..util import get_package_version, get_installed_models, get_base_version from ..util import get_package_path, get_model_meta, is_compatible_model @@ -14,7 +14,7 @@ def validate(): with the installed models. Should be run after `pip install -U spacy`. 
""" model_pkgs, compat = get_model_pkgs() - spacy_version = about.__version__.rsplit(".dev", 1)[0] + spacy_version = get_base_version(about.__version__) current_compat = compat.get(spacy_version, {}) if not current_compat: msg.warn(f"No compatible models found for v{spacy_version} of spaCy") @@ -78,13 +78,12 @@ def get_model_pkgs(): version = get_package_version(pkg_name) if package in compat: is_compat = version in compat[package] - v_maj, v_min = split_version(about.__version__) - spacy_version = f"{v_maj}.{v_min}" + spacy_version = about.__version__ else: model_path = get_package_path(package) model_meta = get_model_meta(model_path) - is_compat = is_compatible_model(model_meta) spacy_version = model_meta.get("spacy_version", "n/a") + is_compat = is_compatible_model(spacy_version) pkgs[pkg_name] = { "name": package, "version": version, diff --git a/spacy/language.py b/spacy/language.py index 551b8c9af..61d69b63e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -191,13 +191,14 @@ class Language(object): @property def meta(self): + spacy_version = util.get_model_version_range(about.__version__) if self.vocab.lang: self._meta.setdefault("lang", self.vocab.lang) else: self._meta.setdefault("lang", self.lang) self._meta.setdefault("name", "model") self._meta.setdefault("version", "0.0.0") - self._meta.setdefault("spacy_version", about.__version__) + self._meta.setdefault("spacy_version", spacy_version) self._meta.setdefault("description", "") self._meta.setdefault("author", "") self._meta.setdefault("email", "") diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 9e67ae83b..9aa95c431 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -95,7 +95,15 @@ def test_ascii_filenames(): @pytest.mark.parametrize( "version,compatible", - [(spacy_version, True), ("2.0.0", False), (">=1.2.3,<4.5.6", False)], + [ + (spacy_version, True), + (f">={spacy_version}", True), + ("2.0.0", False), + (">=2.0.0", True), + (">=1.0.0,<2.1.1", False), + (">=1.2.3,<4.5.6", True), + ("n/a", None), + ], ) def test_is_compatible_model(version, compatible): - assert util.is_compatible_model({"spacy_version": version}) is compatible + assert util.is_compatible_model(version) is compatible diff --git a/spacy/util.py b/spacy/util.py index 4e468ef9d..835e46fc6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,6 +14,8 @@ import srsly import catalogue import sys import warnings +from packaging.specifiers import SpecifierSet, InvalidSpecifier +from packaging.version import Version, InvalidVersion try: @@ -236,42 +238,31 @@ def get_package_version(name): return None -def split_version(version): - """RETURNS (tuple): Two integers, the major and minor spaCy version.""" - pieces = version.split(".", 3) - return int(pieces[0]), int(pieces[1]) - - -def is_compatible_model(meta): - """Check if a model is compatible with the current version of spaCy, based - on its meta.json. We compare the version of spaCy the model was created with - with the current version. If the minor version is different, it's considered - incompatible. - - meta (dict): The model's meta. - RETURNS (bool / None): Whether the model is compatible with the current - spaCy or None if we don't have enough info. 
- """ - cur_v = about.__version__ - pkg_v = meta.get("spacy_version") - if not pkg_v or not isinstance(pkg_v, str): +def is_compatible_model(constraint): + version = Version(about.__version__) + if constraint[0].isdigit(): + # Handle cases where exact version is provided as constraint + constraint = f"=={constraint}" + try: + spec = SpecifierSet(constraint) + except InvalidSpecifier: return None - # Handle spacy_version values like >=x,=1.2.3,<1.3.0 based on a given spaCy version. Models are always compatible across patch versions but not across minor or major versions. """ - major, minor = split_version(version) - return f">={version},<{major}.{minor + 1}.0" + release = Version(spacy_version).release + return f">={spacy_version},<{release[0]}.{release[1] + 1}.0" + + +def get_base_version(version): + return Version(version).base_version def load_config(path, create_objects=False): From a7e370bcbfd4234b53061a004c0b588e3ec76c06 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:03:18 +0200 Subject: [PATCH 24/39] Don't override spaCy version --- spacy/cli/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c205fa5b2..590ce4f13 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -467,7 +467,6 @@ def train( # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names - meta["spacy_version"] = about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, From b7aff6020c34ecae3bb0891b469193d8772b8197 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:18:53 +0200 Subject: [PATCH 25/39] Make functions more general purpose and update docstrings and tests --- spacy/cli/validate.py | 4 ++-- spacy/tests/test_misc.py | 22 ++++++++++++---------- spacy/util.py | 27 +++++++++++++++++++++------ 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 3c49abb3e..080cd77e2 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -5,7 +5,7 @@ from wasabi import msg from .. 
import about from ..util import get_package_version, get_installed_models, get_base_version -from ..util import get_package_path, get_model_meta, is_compatible_model +from ..util import get_package_path, get_model_meta, is_compatible_version def validate(): @@ -83,7 +83,7 @@ def get_model_pkgs(): model_path = get_package_path(package) model_meta = get_model_meta(model_path) spacy_version = model_meta.get("spacy_version", "n/a") - is_compat = is_compatible_model(spacy_version) + is_compat = is_compatible_version(about.__version__, spacy_version) pkgs[pkg_name] = { "name": package, "version": version, diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 9aa95c431..e4b4e570c 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -94,16 +94,18 @@ def test_ascii_filenames(): @pytest.mark.parametrize( - "version,compatible", + "version,constraint,compatible", [ - (spacy_version, True), - (f">={spacy_version}", True), - ("2.0.0", False), - (">=2.0.0", True), - (">=1.0.0,<2.1.1", False), - (">=1.2.3,<4.5.6", True), - ("n/a", None), + (spacy_version, spacy_version, True), + (spacy_version, f">={spacy_version}", True), + ("3.0.0", "2.0.0", False), + ("3.2.1", ">=2.0.0", True), + ("2.2.10a1", ">=1.0.0,<2.1.1", False), + ("3.0.0.dev3", ">=1.2.3,<4.5.6", True), + ("n/a", ">=1.2.3,<4.5.6", None), + ("1.2.3", "n/a", None), + ("n/a", "n/a", None), ], ) -def test_is_compatible_model(version, compatible): - assert util.is_compatible_model(version) is compatible +def test_is_compatible_version(version, constraint, compatible): + assert util.is_compatible_version(version, constraint) is compatible diff --git a/spacy/util.py b/spacy/util.py index 835e46fc6..741b289c1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -238,17 +238,27 @@ def get_package_version(name): return None -def is_compatible_model(constraint): - version = Version(about.__version__) +def is_compatible_version(version, constraint, prereleases=True): + """Check if a version (e.g. "2.0.0") is compatible given a version + constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version, + it's interpreted as =={version}. + + version (str): The version to check. + constraint (str): The constraint string. + prereleases (bool): Whether to allow prereleases. If set to False, + prerelease versions will be considered incompatible. + RETURNS (bool / None): Whether the version is compatible, or None if the + version or constraint are invalid. + """ + # Handle cases where exact version is provided as constraint if constraint[0].isdigit(): - # Handle cases where exact version is provided as constraint constraint = f"=={constraint}" try: spec = SpecifierSet(constraint) - except InvalidSpecifier: + version = Version(version) + except (InvalidSpecifier, InvalidVersion): return None - # Allow prereleases and dev versions - spec.prereleases = True + spec.prereleases = prereleases return version in spec @@ -262,6 +272,11 @@ def get_model_version_range(spacy_version): def get_base_version(version): + """Generate the base version without any prerelease identifiers. + + version (str): The version, e.g. "3.0.0.dev1". + RETURNS (str): The base version, e.g. "3.0.0". 
+ """ return Version(version).base_version From 368182776e61f6582223c02cf31b5eee65521d20 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:19:53 +0200 Subject: [PATCH 26/39] Tidy up dependencies --- setup.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index eb7608c4e..c5c39b447 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 - ml_datasets + ml_datasets>=0.1.1 # Third-party dependencies tqdm>=4.38.0,<5.0.0 setuptools @@ -55,7 +55,6 @@ install_requires = plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 - tqdm>=4.38.0,<5.0.0 importlib_metadata>=0.20; python_version < "3.8" [options.extras_require] From dc186afdc5b7f42dd32eeafb239b3d5604b8fbbd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:34:54 +0200 Subject: [PATCH 27/39] Add warning --- spacy/errors.py | 6 ++++++ spacy/util.py | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 932bb1eff..da2cfdf04 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -104,6 +104,12 @@ class Warnings(object): "string \"Field1=Value1,Value2|Field2=Value3\".") # TODO: fix numbering after merging develop into master + W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is " + "incompatible with the current version ({current}). This may lead " + "to unexpected results or runtime errors. To resolve this, " + "download a newer compatible model or retrain your custom model " + "with the current spaCy version. For more details and available " + "updates, run: python -m spacy validate") W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' " "instead.") W097 = ("No Model config was provided to create the '{name}' component, " diff --git a/spacy/util.py b/spacy/util.py index 741b289c1..79134400c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -330,6 +330,16 @@ def get_model_meta(path): for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) + if "spacy_version" in meta: + if not is_compatible_version(about.__version__, meta["spacy_version"]): + warnings.warn( + Warnings.W095.format( + model=f"{meta['lang']}_{meta['name']}", + model_version=meta["version"], + version=meta["spacy_version"], + current=about.__version__, + ) + ) return meta From cd5f748e0982524167e55884a7b1677a63b5b308 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Sat, 30 May 2020 20:27:47 +0200 Subject: [PATCH 28/39] Add onto-joint experiment file --- examples/experiments/onto-joint/defaults.cfg | 115 +++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 examples/experiments/onto-joint/defaults.cfg diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg new file mode 100644 index 000000000..fbac4ea7d --- /dev/null +++ b/examples/experiments/onto-joint/defaults.cfg @@ -0,0 +1,115 @@ +# Training hyper-parameters and additional features. +[training] +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length or number of examples. 
+max_length = 0 +limit = 0 +# Data augmentation +orth_variant_level = 0.0 +dropout = 0.1 +# Controls early-stopping. 0 or -1 mean unlimited. +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 400 +# Other settings +seed = 0 +accumulate_gradient = 1 +use_pytorch_for_gpu_memory = false +# Control how scores are printed and checkpoints are evaluated. +scores = ["speed", "tags_acc", "uas", "las", "ents_f"] +score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} +# These settings are invalid for the transformer models. +init_tok2vec = null +vectors = null + +[training.batch_size] +@schedules = "compounding.v1" +start = 1000 +stop = 1000 +compound = 1.001 + +[optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +#[optimizer.learn_rate] +#@schedules = "warmup_linear.v1" +#warmup_steps = 250 +#total_steps = 20000 +#initial_rate = 0.001 + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.senter] +factory = "senter" + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.senter.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.senter.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tagger.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 8 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = ${nlp:vectors} +width = 256 +depth = 6 +window_size = 1 +embed_size = 10000 +maxout_pieces = 3 +subword_features = true From ec52e7f886ad3839bb509c38707a8ae4e955b7d4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 13:21:55 +0200 Subject: [PATCH 29/39] add oversize examples before StopIteration returns --- spacy/util.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 79134400c..54ecb6edd 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -681,6 +681,9 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0 try: example = next(examples) except StopIteration: + if oversize: + example = oversize.pop(0) + batch.append(example) if batch: yield batch return From fdfd82293688678b1590d680f758c32da3c83d73 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 15:22:54 +0200 Subject: [PATCH 30/39] rewrite minibatch_by_words function --- spacy/util.py | 60 ++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 54ecb6edd..0f8de3ddf 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -656,45 +656,47 @@ def decaying(start, stop, decay): curr -= 
decay -def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2): +def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False): """Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by - themselves.""" + themselves, or be discarded if discard_oversize=True.""" if isinstance(size, int): size_ = itertools.repeat(size) elif isinstance(size, List): size_ = iter(size) else: size_ = size - examples = iter(examples) - oversize = [] - while True: - batch_size = next(size_) - tol_size = batch_size * 0.2 - batch = [] - if oversize: - example = oversize.pop(0) - n_words = count_words(example.doc) + + target_size = next(size_) + tol_size = target_size * tolerance + batch = [] + current_size = 0 + + for example in examples: + n_words = count_words(example.doc) + # add the example to the current batch if it still fits + if (current_size + n_words) < (target_size + tol_size): batch.append(example) - batch_size -= n_words - while batch_size >= 1: - try: - example = next(examples) - except StopIteration: - if oversize: - example = oversize.pop(0) - batch.append(example) - if batch: - yield batch - return - n_words = count_words(example.doc) - if n_words < (batch_size + tol_size): - batch_size -= n_words - batch.append(example) + current_size += n_words + else: + # if the current example exceeds the batch size, it is returned separately + # but only if discard_oversize=False. + if current_size > target_size: + if not discard_oversize: + yield [example] + # yield the previous batch and start a new one else: - oversize.append(example) - if batch: - yield batch + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + # In theory it may happen that the current example now exceeds the new target_size, + # but that seems like an unimportant edge case if batch sizes are variable anyway? + batch = [example] + current_size = n_words + + # yield the final batch + if batch: + yield batch def itershuffle(iterable, bufsize=1000): From 5b350a6c9998ccb53439f2721159ab92ca61003f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 17:49:33 +0200 Subject: [PATCH 31/39] bugfix of the bugfix --- spacy/util.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 0f8de3ddf..f5ca49637 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -674,25 +674,26 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o for example in examples: n_words = count_words(example.doc) + # if the current example exceeds the batch size, it is returned separately + # but only if discard_oversize=False. + if n_words > target_size: + if not discard_oversize: + yield [example] + # add the example to the current batch if it still fits - if (current_size + n_words) < (target_size + tol_size): + elif (current_size + n_words) < (target_size + tol_size): batch.append(example) current_size += n_words + + # yield the previous batch and start a new one else: - # if the current example exceeds the batch size, it is returned separately - # but only if discard_oversize=False. 
- if current_size > target_size: - if not discard_oversize: - yield [example] - # yield the previous batch and start a new one - else: - yield batch - target_size = next(size_) - tol_size = target_size * tolerance - # In theory it may happen that the current example now exceeds the new target_size, - # but that seems like an unimportant edge case if batch sizes are variable anyway? - batch = [example] - current_size = n_words + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + # In theory it may happen that the current example now exceeds the new target_size, + # but that seems like an unimportant edge case if batch sizes are variable anyway? + batch = [example] + current_size = n_words # yield the final batch if batch: From 85b0597ed5f8e23de337f56966e4b342827a99c3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 18:26:21 +0200 Subject: [PATCH 32/39] add test for minibatch util --- spacy/tests/test_util.py | 23 +++++++++++++++++++++++ spacy/tests/util.py | 7 +++++++ 2 files changed, 30 insertions(+) create mode 100644 spacy/tests/test_util.py diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py new file mode 100644 index 000000000..382a8f548 --- /dev/null +++ b/spacy/tests/test_util.py @@ -0,0 +1,23 @@ +import pytest +from spacy.gold import Example + +from .util import get_doc + +from spacy.util import minibatch_by_words + + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 400, 199], [3]), + ([400, 400, 199, 3], [4]), + ([400, 400, 199, 3, 250], [3, 2]), + ], +) +def test_util_minibatch(doc_sizes, expected_batches): + docs = [get_doc(doc_size) for doc_size in doc_sizes] + + examples = [Example(doc=doc) for doc in docs] + + batches = list(minibatch_by_words(examples=examples, size=1000)) + assert [len(batch) for batch in batches] == expected_batches diff --git a/spacy/tests/util.py b/spacy/tests/util.py index e29342268..73650a6f7 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -92,6 +92,13 @@ def get_batch(batch_size): return docs +def get_doc(n_words): + vocab = Vocab() + # Make the words numbers, so that they're easy to track. 
+ numbers = [str(i) for i in range(0, n_words)] + return Doc(vocab, words=numbers) + + def apply_transition_sequence(parser, doc, sequence): """Perform a series of pre-specified transitions, to put the parser in a desired state.""" From 6651fafd5cad7edf34dfb1374c962dff6ce901e9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 19:43:39 +0200 Subject: [PATCH 33/39] using overflow buffer for examples within the tolerance margin --- spacy/tests/test_util.py | 4 ++-- spacy/util.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 382a8f548..93201eb4b 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -11,13 +11,13 @@ from spacy.util import minibatch_by_words [ ([400, 400, 199], [3]), ([400, 400, 199, 3], [4]), + ([400, 400, 199, 3, 1], [5]), ([400, 400, 199, 3, 250], [3, 2]), + ([400, 400, 199, 3, 1, 250], [3, 3]), ], ) def test_util_minibatch(doc_sizes, expected_batches): docs = [get_doc(doc_size) for doc_size in doc_sizes] - examples = [Example(doc=doc) for doc in docs] - batches = list(minibatch_by_words(examples=examples, size=1000)) assert [len(batch) for batch in batches] == expected_batches diff --git a/spacy/util.py b/spacy/util.py index f5ca49637..8ac2fd370 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -670,7 +670,9 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o target_size = next(size_) tol_size = target_size * tolerance batch = [] + overflow = [] current_size = 0 + overflow_size = 0 for example in examples: n_words = count_words(example.doc) @@ -681,10 +683,15 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o yield [example] # add the example to the current batch if it still fits - elif (current_size + n_words) < (target_size + tol_size): + elif (current_size + n_words) < target_size: batch.append(example) current_size += n_words + # add the example to the overflow buffer if it fits in the tolerance margins + elif (current_size + n_words) < (target_size + tol_size): + overflow.append(example) + overflow_size += n_words + # yield the previous batch and start a new one else: yield batch @@ -692,11 +699,15 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o tol_size = target_size * tolerance # In theory it may happen that the current example now exceeds the new target_size, # but that seems like an unimportant edge case if batch sizes are variable anyway? 
- batch = [example] - current_size = n_words + batch = overflow + batch.append(example) + current_size = overflow_size + n_words + overflow = [] + overflow_size = 0 # yield the final batch if batch: + batch.extend(overflow) yield batch From 6208d322d383455ea91c1e30b2c834a08e2cbbf0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 19:47:30 +0200 Subject: [PATCH 34/39] slightly more challenging unit test --- spacy/tests/test_util.py | 4 ++-- spacy/util.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 93201eb4b..a0c6ab6c0 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -12,8 +12,8 @@ from spacy.util import minibatch_by_words ([400, 400, 199], [3]), ([400, 400, 199, 3], [4]), ([400, 400, 199, 3, 1], [5]), - ([400, 400, 199, 3, 250], [3, 2]), - ([400, 400, 199, 3, 1, 250], [3, 3]), + ([400, 400, 199, 3, 200], [3, 2]), + ([400, 400, 199, 3, 1, 200], [3, 3]), ], ) def test_util_minibatch(doc_sizes, expected_batches): diff --git a/spacy/util.py b/spacy/util.py index 8ac2fd370..b4e6f7fb1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -682,13 +682,13 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o if not discard_oversize: yield [example] - # add the example to the current batch if it still fits - elif (current_size + n_words) < target_size: + # add the example to the current batch if it still fits and there's no overflow yet + elif overflow_size == 0 and (current_size + n_words) < target_size: batch.append(example) current_size += n_words # add the example to the overflow buffer if it fits in the tolerance margins - elif (current_size + n_words) < (target_size + tol_size): + elif (current_size + overflow_size + n_words) < (target_size + tol_size): overflow.append(example) overflow_size += n_words From ef834b4cd7f51d24b2df451b091caaf21586d199 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 19:50:44 +0200 Subject: [PATCH 35/39] fix comments --- spacy/util.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index b4e6f7fb1..3f7a96a19 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -682,23 +682,23 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o if not discard_oversize: yield [example] - # add the example to the current batch if it still fits and there's no overflow yet + # add the example to the current batch if there's no overflow yet and it still fits elif overflow_size == 0 and (current_size + n_words) < target_size: batch.append(example) current_size += n_words - # add the example to the overflow buffer if it fits in the tolerance margins + # add the example to the overflow buffer if it fits in the tolerance margin elif (current_size + overflow_size + n_words) < (target_size + tol_size): overflow.append(example) overflow_size += n_words - # yield the previous batch and start a new one + # yield the previous batch and start a new one. The new one gets the overflow examples. else: yield batch target_size = next(size_) tol_size = target_size * tolerance - # In theory it may happen that the current example now exceeds the new target_size, - # but that seems like an unimportant edge case if batch sizes are variable anyway? + # In theory it may happen that the current example + overflow examples now exceed the new + # target_size, but that seems like an unimportant edge case if batch sizes are variable? 
batch = overflow batch.append(example) current_size = overflow_size + n_words From f2e162fc60dab95e16efbb7310e4745689cb886c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 19:59:04 +0200 Subject: [PATCH 36/39] it's only oversized if the tolerance level is also exceeded --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 3f7a96a19..598545b84 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -678,7 +678,7 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o n_words = count_words(example.doc) # if the current example exceeds the batch size, it is returned separately # but only if discard_oversize=False. - if n_words > target_size: + if n_words > target_size + tol_size: if not discard_oversize: yield [example] From aa6271b16ca653d24010a5bf325fcc36ac757361 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 22:05:08 +0200 Subject: [PATCH 37/39] extending algorithm to deal better with edge cases --- spacy/tests/test_util.py | 20 ++++++++++++++++++-- spacy/util.py | 33 ++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index a0c6ab6c0..207805c81 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -11,13 +11,29 @@ from spacy.util import minibatch_by_words [ ([400, 400, 199], [3]), ([400, 400, 199, 3], [4]), - ([400, 400, 199, 3, 1], [5]), ([400, 400, 199, 3, 200], [3, 2]), + + ([400, 400, 199, 3, 1], [5]), + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 200], [3, 3]), + ([400, 400, 199, 3, 1, 999], [3, 3]), + ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), + + ([1, 2, 999], [3]), + ([1, 2, 999, 1], [4]), + ([1, 200, 999, 1], [2, 2]), + ([1, 999, 200, 1], [2, 2]), ], ) def test_util_minibatch(doc_sizes, expected_batches): docs = [get_doc(doc_size) for doc_size in doc_sizes] examples = [Example(doc=doc) for doc in docs] - batches = list(minibatch_by_words(examples=examples, size=1000)) + tol = 0.2 + batch_size = 1000 + batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True)) assert [len(batch) for batch in batches] == expected_batches + + max_size = batch_size + batch_size * tol + for batch in batches: + assert sum([len(example.doc) for example in batch]) < max_size + diff --git a/spacy/util.py b/spacy/util.py index 598545b84..2d732e2b7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -671,24 +671,24 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o tol_size = target_size * tolerance batch = [] overflow = [] - current_size = 0 + batch_size = 0 overflow_size = 0 for example in examples: n_words = count_words(example.doc) - # if the current example exceeds the batch size, it is returned separately + # if the current example exceeds the maximum batch size, it is returned separately # but only if discard_oversize=False. 
if n_words > target_size + tol_size: if not discard_oversize: yield [example] # add the example to the current batch if there's no overflow yet and it still fits - elif overflow_size == 0 and (current_size + n_words) < target_size: + elif overflow_size == 0 and (batch_size + n_words) <= target_size: batch.append(example) - current_size += n_words + batch_size += n_words # add the example to the overflow buffer if it fits in the tolerance margin - elif (current_size + overflow_size + n_words) < (target_size + tol_size): + elif (batch_size + overflow_size + n_words) <= (target_size + tol_size): overflow.append(example) overflow_size += n_words @@ -697,14 +697,29 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o yield batch target_size = next(size_) tol_size = target_size * tolerance - # In theory it may happen that the current example + overflow examples now exceed the new - # target_size, but that seems like an unimportant edge case if batch sizes are variable? batch = overflow - batch.append(example) - current_size = overflow_size + n_words + batch_size = overflow_size overflow = [] overflow_size = 0 + # this example still fits + if (batch_size + n_words) <= target_size: + batch.append(example) + batch_size += n_words + + # this example fits in overflow + elif (batch_size + n_words) <= (target_size + tol_size): + overflow.append(example) + overflow_size += n_words + + # this example does not fit with the previous overflow: start another new batch + else: + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + batch = [example] + batch_size = n_words + # yield the final batch if batch: batch.extend(overflow) From 2bf5111ecf369a2e5b807067823aadcdc635bc70 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 22:09:37 +0200 Subject: [PATCH 38/39] additional test with discard_oversize=False --- spacy/tests/test_util.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 207805c81..6b6e84a17 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -12,13 +12,11 @@ from spacy.util import minibatch_by_words ([400, 400, 199], [3]), ([400, 400, 199, 3], [4]), ([400, 400, 199, 3, 200], [3, 2]), - ([400, 400, 199, 3, 1], [5]), ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 200], [3, 3]), ([400, 400, 199, 3, 1, 999], [3, 3]), ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), - ([1, 2, 999], [3]), ([1, 2, 999, 1], [4]), ([1, 200, 999, 1], [2, 2]), @@ -37,3 +35,25 @@ def test_util_minibatch(doc_sizes, expected_batches): for batch in batches: assert sum([len(example.doc) for example in batch]) < max_size + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 4000, 199], [1, 2]), + ([400, 400, 199, 3000, 200], [1, 4]), + ([400, 400, 199, 3, 1, 1500], [1, 5]), + ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]), + ([1, 2, 9999], [1, 2]), + ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]), + ], +) +def test_util_minibatch_oversize(doc_sizes, expected_batches): + """ Test that oversized documents are returned in their own batch""" + docs = [get_doc(doc_size) for doc_size in doc_sizes] + examples = [Example(doc=doc) for doc in docs] + tol = 0.2 + batch_size = 1000 + batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False)) + assert [len(batch) for batch in batches] == expected_batches + + From 
c5ac382f0aaa03b6ca80d6ad61b11b325ee46702 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 22:24:57 +0200 Subject: [PATCH 39/39] fix name clash --- spacy/tests/test_util.py | 6 +++--- spacy/tests/util.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 6b6e84a17..1410755db 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,7 +1,7 @@ import pytest from spacy.gold import Example -from .util import get_doc +from .util import get_random_doc from spacy.util import minibatch_by_words @@ -24,7 +24,7 @@ from spacy.util import minibatch_by_words ], ) def test_util_minibatch(doc_sizes, expected_batches): - docs = [get_doc(doc_size) for doc_size in doc_sizes] + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 @@ -49,7 +49,7 @@ def test_util_minibatch(doc_sizes, expected_batches): ) def test_util_minibatch_oversize(doc_sizes, expected_batches): """ Test that oversized documents are returned in their own batch""" - docs = [get_doc(doc_size) for doc_size in doc_sizes] + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 73650a6f7..3d0a023c9 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -92,7 +92,7 @@ def get_batch(batch_size): return docs -def get_doc(n_words): +def get_random_doc(n_words): vocab = Vocab() # Make the words numbers, so that they're easy to track. numbers = [str(i) for i in range(0, n_words)]
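
The last stretch of patches (29-39) iterates on `minibatch_by_words`: batches are filled up to `size` words, examples that only fit within the `tolerance` margin are parked in an overflow buffer that seeds the next batch, and oversized examples are either yielded on their own or dropped when `discard_oversize=True`. The following usage sketch mirrors one case from the unit test above; the token counts and expected split are taken straight from the test parametrization, and constructing `Doc` objects directly stands in for the test's `get_random_doc` helper.

```python
# Usage sketch based on the final state of the patch series.
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold import Example            # Example lives in spacy.gold on this branch
from spacy.util import minibatch_by_words

vocab = Vocab()
# Same token counts as one of the test cases: expected batch sizes are [3, 2].
docs = [Doc(vocab, words=[str(i) for i in range(n)]) for n in (400, 400, 199, 3, 200)]
examples = [Example(doc=doc) for doc in docs]

batches = list(
    minibatch_by_words(examples, size=1000, tolerance=0.2, discard_oversize=True)
)
assert [len(batch) for batch in batches] == [3, 2]
# No batch exceeds size * (1 + tolerance) = 1200 words.
assert all(sum(len(eg.doc) for eg in batch) <= 1200 for batch in batches)
```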