diff --git a/Makefile b/Makefile
index cf96d6294..9916e3cf5 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
 version := $(shell "bin/get-version.sh")

 dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
-	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data
+	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data
 	chmod a+rx $@

 dist/pytest.pex : wheelhouse/pytest-*.whl
@@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl

 wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
 	$(VENV)/bin/pip wheel . -w ./wheelhouse
-	$(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse
+	$(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse
 	touch $@

 wheelhouse/pytest-%.whl : $(VENV)/bin/pex
diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg
new file mode 100644
index 000000000..fbac4ea7d
--- /dev/null
+++ b/examples/experiments/onto-joint/defaults.cfg
@@ -0,0 +1,115 @@
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 0
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early-stopping. 0 or -1 mean unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 400
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+vectors = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
+
+#[optimizer.learn_rate]
+#@schedules = "warmup_linear.v1"
+#warmup_steps = 250
+#total_steps = 20000
+#initial_rate = 0.001
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.senter]
+factory = "senter"
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.senter.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.senter.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 256
+depth = 6
+window_size = 1
+embed_size = 10000
+maxout_pieces = 3
+subword_features = true
diff --git a/requirements.txt b/requirements.txt
index e5f1ae10b..a104b68ba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,9 +13,11 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac>=0.9.6,<1.2.0
 tqdm>=4.38.0,<5.0.0
-# Optional dependencies
-jsonschema>=2.6.0,<3.1.0
 pydantic>=1.3.0,<2.0.0
+# Official Python utilities
+setuptools
+packaging
+importlib_metadata>=0.20; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=4.6.5
diff --git a/setup.cfg b/setup.cfg
index f0895bbbb..c19b8d857 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,15 +47,17 @@ install_requires =
     wasabi>=0.4.0,<1.1.0
     srsly>=2.0.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
-    ml_datasets
+    ml_datasets>=0.1.1
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
-    setuptools
     numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
     pydantic>=1.3.0,<2.0.0
-    tqdm>=4.38.0,<5.0.0
+    # Official Python utilities
+    setuptools
+    packaging
+    importlib_metadata>=0.20; python_version < "3.8"

 [options.extras_require]
 lookups =
diff --git a/spacy/about.py b/spacy/about.py
index 3af1b77a0..04a660ad1 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev8"
+__version__ = "3.0.0.dev9"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 0b2920802..1ece755b8 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -111,8 +111,8 @@ def
get_entities(lines, tag_pattern, ner_map=None): final entity type with `ner_map` if mapping present. Entity tag is 'O' if the pattern is not matched. - lines (unicode): CONLL-U lines for one sentences - tag_pattern (unicode): Regex pattern for entity tag + lines (str): CONLL-U lines for one sentences + tag_pattern (str): Regex pattern for entity tag ner_map (dict): Map old NER tag names to new ones, '' maps to O. RETURNS (list): List of BILUO entity tags """ @@ -187,8 +187,8 @@ def example_from_conllu_sentence( """Create an Example from the lines for one CoNLL-U sentence, merging subtokens and appending morphology to tags if required. - lines (unicode): The non-comment lines for a CoNLL-U sentence - ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col + lines (str): The non-comment lines for a CoNLL-U sentence + ner_tag_pattern (str): The regex pattern for matching NER in MISC col RETURNS (Example): An example containing the annotation """ # create a Doc with each subtoken as its own token diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0230e272d..3d56822a5 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -5,6 +5,7 @@ import sys from wasabi import msg from .. import about +from ..util import is_package, get_base_version def download( @@ -17,7 +18,7 @@ def download( flag is set, the command expects the full model name with version. For direct downloads, the compatibility check will be skipped. """ - if not require_package("spacy") and "--no-deps" not in pip_args: + if not is_package("spacy") and "--no-deps" not in pip_args: msg.warn( "Skipping model package dependencies and setting `--no-deps`. " "You don't seem to have the spaCy package itself installed " @@ -45,21 +46,6 @@ def download( "Download and installation successful", f"You can now load the model via spacy.load('{model_name}')", ) - # If a model is downloaded and then loaded within the same process, our - # is_package check currently fails, because pkg_resources.working_set - # is not refreshed automatically (see #3923). We're trying to work - # around this here be requiring the package explicitly. - require_package(model_name) - - -def require_package(name): - try: - import pkg_resources - - pkg_resources.working_set.require(name) - return True - except: # noqa: E722 - return False def get_json(url, desc): @@ -77,8 +63,7 @@ def get_json(url, desc): def get_compatibility(): - version = about.__version__ - version = version.rsplit(".dev", 1)[0] + version = get_base_version(about.__version__) comp_table = get_json(about.__compatibility__, "compatibility table") comp = comp_table["spacy"] if version not in comp: @@ -87,7 +72,7 @@ def get_compatibility(): def get_version(model, comp): - model = model.rsplit(".dev", 1)[0] + model = get_base_version(model) if model not in comp: msg.fail( f"No compatible model found for '{model}' (spaCy v{about.__version__})", diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 23f766368..98fd5cabf 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -48,7 +48,9 @@ def info( "Location": str(Path(__file__).parent.parent), "Platform": platform.platform(), "Python version": platform.python_version(), - "Models": ", ".join(model["name"] for model in all_models.values()), + "Models": ", ".join( + f"{m['name']} ({m['version']})" for m in all_models.values() + ), } if not silent: title = "Info about spaCy" @@ -63,7 +65,7 @@ def print_markdown(data, title=None): """Print data in GitHub-flavoured Markdown format for issues etc. 
data (dict or list of tuples): Label/value pairs. - title (unicode or None): Title, will be rendered as headline 2. + title (str / None): Title, will be rendered as headline 2. """ markdown = [] for key, value in data.items(): diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8e27e44d0..153e61ba3 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg): ("lang", "Model language", meta.get("lang", "en")), ("name", "Model name", meta.get("name", "model")), ("version", "Model version", meta.get("version", "0.0.0")), - ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"), ("description", "Model description", meta.get("description", False)), ("author", "Author", meta.get("author", False)), ("email", "Author email", meta.get("email", False)), ("url", "Author website", meta.get("url", False)), - ("license", "License", meta.get("license", "CC BY-SA 3.0")), + ("license", "License", meta.get("license", "MIT")), ] nlp = util.load_model_from_path(Path(model_path)) + meta["spacy_version"] = util.get_model_version_range(about.__version__) meta["pipeline"] = nlp.pipe_names meta["vectors"] = { "width": nlp.vocab.vectors_length, @@ -168,6 +168,7 @@ def setup_package(): package_data={model_name: list_files(model_dir)}, install_requires=list_requirements(meta), zip_safe=False, + entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]} ) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d4010c43b..cbe977cad 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -483,7 +483,6 @@ def train( # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names - meta["spacy_version"] = f">={about.__version__}" if beam_width == 1: meta["speed"] = { "nwords": nwords, diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index c75c861cc..c0e3bd169 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -7,7 +7,7 @@ from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import Model +from thinc.api import Model, use_pytorch_for_gpu_memory import random from ..gold import GoldCorpus @@ -171,6 +171,8 @@ def train_from_config( msg.info(f"Loading config from: {config_path}") config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) + if config["training"]["use_pytorch_for_gpu_memory"]: + use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) msg.info("Creating nlp from config") @@ -213,6 +215,12 @@ def train_from_config( if is_best_checkpoint and output_path is not None: nlp.to_disk(output_path) progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) + # Clean up the objects to faciliate garbage collection. + for eg in batch: + eg.doc = None + eg.goldparse = None + eg.doc_annotation = None + eg.token_annotation = None finally: if output_path is not None: final_model_path = output_path / "model-final" diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index a23ce3453..080cd77e2 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -4,6 +4,8 @@ import requests from wasabi import msg from .. import about +from ..util import get_package_version, get_installed_models, get_base_version +from ..util import get_package_path, get_model_meta, is_compatible_version def validate(): @@ -12,7 +14,7 @@ def validate(): with the installed models. 
Should be run after `pip install -U spacy`. """ model_pkgs, compat = get_model_pkgs() - spacy_version = about.__version__.rsplit(".dev", 1)[0] + spacy_version = get_base_version(about.__version__) current_compat = compat.get(spacy_version, {}) if not current_compat: msg.warn(f"No compatible models found for v{spacy_version} of spaCy") @@ -25,7 +27,7 @@ def validate(): msg.info(f"spaCy installation: {spacy_dir}") if model_pkgs: - header = ("NAME", "VERSION", "") + header = ("NAME", "SPACY", "VERSION", "") rows = [] for name, data in model_pkgs.items(): if data["compat"]: @@ -34,7 +36,7 @@ def validate(): else: version = msg.text(data["version"], color="red", no_print=True) comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" - rows.append((data["name"], version, comp)) + rows.append((data["name"], data["spacy"], version, comp)) msg.table(rows, header=header) else: msg.text("No models found in your current environment.", exits=0) @@ -44,8 +46,9 @@ def validate(): cmd = "python -m spacy download {}" print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") if na_models: - msg.warn( - f"The following models are not available for spaCy v{about.__version__}:", + msg.info( + f"The following models are custom spaCy models or not " + f"available for spaCy v{about.__version__}:", ", ".join(na_models), ) if incompat_models: @@ -53,8 +56,6 @@ def validate(): def get_model_pkgs(): - import pkg_resources - with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: @@ -66,20 +67,29 @@ def get_model_pkgs(): msg.good("Loaded compatibility table") compat = r.json()["spacy"] all_models = set() + installed_models = get_installed_models() for spacy_v, models in dict(compat).items(): all_models.update(models.keys()) for model, model_vs in models.items(): compat[spacy_v][model] = [reformat_version(v) for v in model_vs] pkgs = {} - for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): + for pkg_name in installed_models: package = pkg_name.replace("-", "_") - if package in all_models: - version = pkg_data.version - pkgs[pkg_name] = { - "name": package, - "version": version, - "compat": package in compat and version in compat[package], - } + version = get_package_version(pkg_name) + if package in compat: + is_compat = version in compat[package] + spacy_version = about.__version__ + else: + model_path = get_package_path(package) + model_meta = get_model_meta(model_path) + spacy_version = model_meta.get("spacy_version", "n/a") + is_compat = is_compatible_version(about.__version__, spacy_version) + pkgs[pkg_name] = { + "name": package, + "version": version, + "spacy": spacy_version, + "compat": is_compat, + } return pkgs, compat diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 3f84dabce..2c377a043 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -22,13 +22,13 @@ def render( """Render displaCy visualisation. docs (list or Doc): Document(s) to visualise. - style (unicode): Visualisation style, 'dep' or 'ent'. + style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. jupyter (bool): Override Jupyter auto-detection. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. - RETURNS (unicode): Rendered HTML markup. + RETURNS (str): Rendered HTML markup. 
DOCS: https://spacy.io/api/top-level#displacy.render USAGE: https://spacy.io/usage/visualizers @@ -73,13 +73,13 @@ def serve( """Serve displaCy visualisation. docs (list or Doc): Document(s) to visualise. - style (unicode): Visualisation style, 'dep' or 'ent'. + style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. port (int): Port to serve visualisation. - host (unicode): Host to serve visualisation. + host (str): Host to serve visualisation. DOCS: https://spacy.io/api/top-level#displacy.serve USAGE: https://spacy.io/usage/visualizers diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 0d4cdb77f..ef8632cbc 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -47,7 +47,7 @@ class DependencyRenderer(object): parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (unicode): Rendered SVG or HTML markup. + RETURNS (str): Rendered SVG or HTML markup. """ # Create a random ID prefix to make sure parses don't receive the # same ID, even if they're identical @@ -78,7 +78,7 @@ class DependencyRenderer(object): render_id (int): Unique ID, typically index of document. words (list): Individual words and their tags. arcs (list): Individual arcs and their start, end, direction and label. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ self.levels = self.get_levels(arcs) self.highest_level = len(self.levels) @@ -112,10 +112,10 @@ class DependencyRenderer(object): ): """Render individual word. - text (unicode): Word text. - tag (unicode): Part-of-speech tag. + text (str): Word text. + tag (str): Part-of-speech tag. i (int): Unique ID, typically word index. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ y = self.offset_y + self.word_spacing x = self.offset_x + i * self.distance @@ -131,12 +131,12 @@ class DependencyRenderer(object): def render_arrow(self, label, start, end, direction, i): """Render individual arrow. - label (unicode): Dependency label. + label (str): Dependency label. start (int): Index of start word. end (int): Index of end word. - direction (unicode): Arrow direction, 'left' or 'right'. + direction (str): Arrow direction, 'left' or 'right'. i (int): Unique ID, typically arrow index. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ if start < 0 or end < 0: error_args = dict(start=start, end=end, label=label, dir=direction) @@ -179,7 +179,7 @@ class DependencyRenderer(object): y (int): Y-coordinate of arrow start and end point. y_curve (int): Y-corrdinate of Cubic Bézier y_curve point. x_end (int): X-coordinate of arrow end point. - RETURNS (unicode): Definition of the arc path ('d' attribute). + RETURNS (str): Definition of the arc path ('d' attribute). """ template = "M{x},{y} C{x},{c} {e},{c} {e},{y}" if self.compact: @@ -189,11 +189,11 @@ class DependencyRenderer(object): def get_arrowhead(self, direction, x, y, end): """Render individual arrow head. - direction (unicode): Arrow direction, 'left' or 'right'. + direction (str): Arrow direction, 'left' or 'right'. x (int): X-coordinate of arrow start point. y (int): Y-coordinate of arrow start and end point. end (int): X-coordinate of arrow end point. 
- RETURNS (unicode): Definition of the arrow head path ('d' attribute). + RETURNS (str): Definition of the arrow head path ('d' attribute). """ if direction == "left": pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2) @@ -279,7 +279,7 @@ class EntityRenderer(object): parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (unicode): Rendered HTML markup. + RETURNS (str): Rendered HTML markup. """ rendered = [] for i, p in enumerate(parsed): @@ -300,9 +300,9 @@ class EntityRenderer(object): def render_ents(self, text, spans, title): """Render entities in text. - text (unicode): Original text. + text (str): Original text. spans (list): Individual entity spans and their start, end and label. - title (unicode or None): Document title set in Doc.user_data['title']. + title (str / None): Document title set in Doc.user_data['title']. """ markup = "" offset = 0 diff --git a/spacy/errors.py b/spacy/errors.py index 905f7d443..6184c078c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -113,9 +113,12 @@ class Warnings(object): "ignored during training.") # TODO: fix numbering after merging develop into master - W095 = ("Skipping unsupported morphological feature(s): {feature}. " - "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " - "string \"Field1=Value1,Value2|Field2=Value3\".") + W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is " + "incompatible with the current version ({current}). This may lead " + "to unexpected results or runtime errors. To resolve this, " + "download a newer compatible model or retrain your custom model " + "with the current spaCy version. For more details and available " + "updates, run: python -m spacy validate") W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' " "instead.") W097 = ("No Model config was provided to create the '{name}' component, " @@ -124,6 +127,9 @@ class Warnings(object): "so a default configuration was used.") W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', " "but got '{type}' instead, so ignoring it.") + W100 = ("Skipping unsupported morphological feature(s): {feature}. " + "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " + "string \"Field1=Value1,Value2|Field2=Value3\".") @add_codes @@ -621,7 +627,7 @@ class MatchPatternError(ValueError): def __init__(self, key, errors): """Custom error for validating match patterns. - key (unicode): The name of the matcher rule. + key (str): The name of the matcher rule. errors (dict): Validation errors (sequence of strings) mapped to pattern ID, i.e. the index of the added pattern. """ diff --git a/spacy/glossary.py b/spacy/glossary.py index 938a575cd..c4a6a5c45 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -1,8 +1,8 @@ def explain(term): """Get a description for a given POS tag, dependency label or entity type. - term (unicode): The term to explain. - RETURNS (unicode): The explanation, or `None` if not found in the glossary. + term (str): The term to explain. + RETURNS (str): The explanation, or `None` if not found in the glossary. EXAMPLE: >>> spacy.explain(u'NORP') diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 27f9f6553..1e58f0635 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -154,8 +154,8 @@ class GoldCorpus(object): def __init__(self, train, dev, gold_preproc=False, limit=None): """Create a GoldCorpus. 
- train (unicode or Path): File or directory of training data. - dev (unicode or Path): File or directory of development data. + train (str / Path): File or directory of training data. + dev (str / Path): File or directory of development data. RETURNS (GoldCorpus): The newly created object. """ self.limit = limit diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 86a8d49b8..8d8464f3c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -38,7 +38,7 @@ cdef class Candidate: @property def entity_(self): - """RETURNS (unicode): ID/name of this entity in the KB""" + """RETURNS (str): ID/name of this entity in the KB""" return self.kb.vocab.strings[self.entity_hash] @property @@ -48,7 +48,7 @@ cdef class Candidate: @property def alias_(self): - """RETURNS (unicode): ID of the original alias""" + """RETURNS (str): ID of the original alias""" return self.kb.vocab.strings[self.alias_hash] @property diff --git a/spacy/language.py b/spacy/language.py index f8732b471..f281fa1ba 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -17,7 +17,8 @@ from .tokens.underscore import Underscore from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups -from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .pipe_analysis import count_pipeline_interdependencies from .gold import Example from .scorer import Scorer from .util import link_vectors_to_models, create_default_optimizer, registry @@ -127,7 +128,7 @@ class Language(object): Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. - lang (unicode): Two-letter language ID, i.e. ISO code. + lang (str): Two-letter language ID, i.e. ISO code. DOCS: https://spacy.io/api/language """ @@ -196,13 +197,14 @@ class Language(object): @property def meta(self): + spacy_version = util.get_model_version_range(about.__version__) if self.vocab.lang: self._meta.setdefault("lang", self.vocab.lang) else: self._meta.setdefault("lang", self.lang) self._meta.setdefault("name", "model") self._meta.setdefault("version", "0.0.0") - self._meta.setdefault("spacy_version", f">={about.__version__}") + self._meta.setdefault("spacy_version", spacy_version) self._meta.setdefault("description", "") self._meta.setdefault("author", "") self._meta.setdefault("email", "") @@ -292,7 +294,7 @@ class Language(object): def get_pipe(self, name): """Get a pipeline component for a given component name. - name (unicode): Name of pipeline component to get. + name (str): Name of pipeline component to get. RETURNS (callable): The pipeline component. DOCS: https://spacy.io/api/language#get_pipe @@ -305,7 +307,7 @@ class Language(object): def create_pipe(self, name, config=dict()): """Create a pipeline component from a factory. - name (unicode): Factory name to look up in `Language.factories`. + name (str): Factory name to look up in `Language.factories`. config (dict): Configuration parameters to initialise component. RETURNS (callable): Pipeline component. @@ -348,12 +350,12 @@ class Language(object): of before/after/first/last can be set. Default behaviour is "last". component (callable): The pipeline component. - name (unicode): Name of pipeline component. Overwrites existing + name (str): Name of pipeline component. Overwrites existing component.name attribute if available. If no name is set and the component exposes no name attribute, component.__name__ is used. An error is raised if a name already exists in the pipeline. 
- before (unicode): Component name to insert component directly before. - after (unicode): Component name to insert component directly after. + before (str): Component name to insert component directly before. + after (str): Component name to insert component directly after. first (bool): Insert component first / not first in the pipeline. last (bool): Insert component last / not last in the pipeline. @@ -394,7 +396,7 @@ class Language(object): """Check if a component name is present in the pipeline. Equivalent to `name in nlp.pipe_names`. - name (unicode): Name of the component. + name (str): Name of the component. RETURNS (bool): Whether a component of the name exists in the pipeline. DOCS: https://spacy.io/api/language#has_pipe @@ -404,7 +406,7 @@ class Language(object): def replace_pipe(self, name, component): """Replace a component in the pipeline. - name (unicode): Name of the component to replace. + name (str): Name of the component to replace. component (callable): Pipeline component. DOCS: https://spacy.io/api/language#replace_pipe @@ -423,8 +425,8 @@ class Language(object): def rename_pipe(self, old_name, new_name): """Rename a pipeline component. - old_name (unicode): Name of the component to rename. - new_name (unicode): New name of the component. + old_name (str): Name of the component to rename. + new_name (str): New name of the component. DOCS: https://spacy.io/api/language#rename_pipe """ @@ -438,7 +440,7 @@ class Language(object): def remove_pipe(self, name): """Remove a component from the pipeline. - name (unicode): Name of the component to remove. + name (str): Name of the component to remove. RETURNS (tuple): A `(name, component)` tuple of the removed component. DOCS: https://spacy.io/api/language#remove_pipe @@ -455,7 +457,7 @@ class Language(object): and can contain arbitrary whitespace. Alignment into the original string is preserved. - text (unicode): The text to be processed. + text (str): The text to be processed. disable (list): Names of the pipeline components to disable. component_cfg (dict): An optional dictionary with extra keyword arguments for specific components. @@ -564,13 +566,14 @@ class Language(object): if component_cfg is None: component_cfg = {} + component_deps = count_pipeline_interdependencies(self.pipeline) # Determine whether component should set annotations. In theory I guess # we should do this by inspecting the meta? Or we could just always # say "yes" - for name, proc in self.pipeline: + for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) component_cfg[name].setdefault("drop", drop) - component_cfg[name].setdefault("set_annotations", False) + component_cfg[name]["set_annotations"] = bool(component_deps[i]) for name, proc in self.pipeline: if not hasattr(proc, "update"): continue @@ -938,7 +941,7 @@ class Language(object): """Save the current state to a directory. If a model is loaded, this will include the model. - path (unicode or Path): Path to a directory, which will be created if + path (str / Path): Path to a directory, which will be created if it doesn't exist. exclude (list): Names of components or serialization fields to exclude. @@ -972,7 +975,7 @@ class Language(object): returns it. If the saved `Language` object contains a model, the model will be loaded. - path (unicode or Path): A path to a directory. + path (str / Path): A path to a directory. exclude (list): Names of components or serialization fields to exclude. RETURNS (Language): The modified `Language` object. 
@@ -1090,7 +1093,7 @@ class component(object): ): """Decorate a pipeline component. - name (unicode): Default component and factory name. + name (str): Default component and factory name. assigns (list): Attributes assigned by component, e.g. `["token.pos"]`. requires (list): Attributes required by component, e.g. `["token.dep"]`. retokenizes (bool): Whether the component changes the tokenization. diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 517a10866..c4944407f 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -30,8 +30,8 @@ class Lemmatizer(object): def __call__(self, string, univ_pos, morphology=None): """Lemmatize a string. - string (unicode): The string to lemmatize, e.g. the token text. - univ_pos (unicode / int): The token's universal part-of-speech tag. + string (str): The string to lemmatize, e.g. the token text. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. RETURNS (list): The available lemmas for the string. @@ -69,7 +69,7 @@ class Lemmatizer(object): Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. - univ_pos (unicode / int): The token's universal part-of-speech tag. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. """ @@ -126,10 +126,10 @@ class Lemmatizer(object): """Look up a lemma in the table, if available. If no lemma is found, the original string is returned. - string (unicode): The original string. + string (str): The original string. orth (int): Optional hash of the string to look up. If not set, the string will be used and hashed. - RETURNS (unicode): The lemma if the string was found, otherwise the + RETURNS (str): The lemma if the string was found, otherwise the original string. """ lookup_table = self.lookups.get_table("lemma_lookup", {}) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 40aab697e..fc3b30a6d 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -164,7 +164,7 @@ cdef class Lexeme: self.vocab.set_vector(self.c.orth, vector) property rank: - """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used + """RETURNS (str): Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors.""" def __get__(self): return self.c.id @@ -187,18 +187,18 @@ cdef class Lexeme: @property def orth_(self): - """RETURNS (unicode): The original verbatim text of the lexeme + """RETURNS (str): The original verbatim text of the lexeme (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes.""" return self.vocab.strings[self.c.orth] @property def text(self): - """RETURNS (unicode): The original verbatim text of the lexeme.""" + """RETURNS (str): The original verbatim text of the lexeme.""" return self.orth_ property lower: - """RETURNS (unicode): Lowercase form of the lexeme.""" + """RETURNS (str): Lowercase form of the lexeme.""" def __get__(self): return self.c.lower @@ -281,7 +281,7 @@ cdef class Lexeme: prob_table[self.c.orth] = x property lower_: - """RETURNS (unicode): Lowercase form of the word.""" + """RETURNS (str): Lowercase form of the word.""" def __get__(self): return self.vocab.strings[self.c.lower] @@ -289,7 +289,7 @@ cdef class Lexeme: self.c.lower = self.vocab.strings.add(x) property norm_: - """RETURNS (unicode): The lexemes's norm, i.e. 
a normalised form of the + """RETURNS (str): The lexemes's norm, i.e. a normalised form of the lexeme text. """ def __get__(self): @@ -299,7 +299,7 @@ cdef class Lexeme: self.norm = self.vocab.strings.add(x) property shape_: - """RETURNS (unicode): Transform of the word's string, to show + """RETURNS (str): Transform of the word's string, to show orthographic features. """ def __get__(self): @@ -309,7 +309,7 @@ cdef class Lexeme: self.c.shape = self.vocab.strings.add(x) property prefix_: - """RETURNS (unicode): Length-N substring from the start of the word. + """RETURNS (str): Length-N substring from the start of the word. Defaults to `N=1`. """ def __get__(self): @@ -319,7 +319,7 @@ cdef class Lexeme: self.c.prefix = self.vocab.strings.add(x) property suffix_: - """RETURNS (unicode): Length-N substring from the end of the word. + """RETURNS (str): Length-N substring from the end of the word. Defaults to `N=3`. """ def __get__(self): @@ -329,7 +329,7 @@ cdef class Lexeme: self.c.suffix = self.vocab.strings.add(x) property lang_: - """RETURNS (unicode): Language of the parent vocabulary.""" + """RETURNS (str): Language of the parent vocabulary.""" def __get__(self): return self.vocab.strings[self.c.lang] diff --git a/spacy/lookups.py b/spacy/lookups.py index 7e49f4dca..d6aa5f9a0 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -31,7 +31,7 @@ class Lookups(object): """Check if the lookups contain a table of a given name. Delegates to Lookups.has_table. - name (unicode): Name of the table. + name (str): Name of the table. RETURNS (bool): Whether a table of that name is in the lookups. """ return self.has_table(name) @@ -48,7 +48,7 @@ class Lookups(object): def add_table(self, name, data=SimpleFrozenDict()): """Add a new table to the lookups. Raises an error if the table exists. - name (unicode): Unique name of table. + name (str): Unique name of table. data (dict): Optional data to add to the table. RETURNS (Table): The newly added table. @@ -64,7 +64,7 @@ class Lookups(object): """Get a table. Raises an error if the table doesn't exist and no default value is provided. - name (unicode): Name of the table. + name (str): Name of the table. default: Optional default value to return if table doesn't exist. RETURNS (Table): The table. @@ -79,7 +79,7 @@ class Lookups(object): def remove_table(self, name): """Remove a table. Raises an error if the table doesn't exist. - name (unicode): Name of the table to remove. + name (str): Name of the table to remove. RETURNS (Table): The removed table. DOCS: https://spacy.io/api/lookups#remove_table @@ -91,7 +91,7 @@ class Lookups(object): def has_table(self, name): """Check if the lookups contain a table of a given name. - name (unicode): Name of the table. + name (str): Name of the table. RETURNS (bool): Whether a table of that name exists. DOCS: https://spacy.io/api/lookups#has_table @@ -125,7 +125,7 @@ class Lookups(object): """Save the lookups to a directory as lookups.bin. Expects a path to a directory, which will be created if it doesn't exist. - path (unicode / Path): The file path. + path (str / Path): The file path. DOCS: https://spacy.io/api/lookups#to_disk """ @@ -141,7 +141,7 @@ class Lookups(object): """Load lookups from a directory containing a lookups.bin. Will skip loading if the file doesn't exist. - path (unicode / Path): The directory path. + path (str / Path): The directory path. RETURNS (Lookups): The loaded lookups. 
DOCS: https://spacy.io/api/lookups#from_disk @@ -167,7 +167,7 @@ class Table(OrderedDict): """Initialize a new table from a dict. data (dict): The dictionary. - name (unicode): Optional table name for reference. + name (str): Optional table name for reference. RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.from_dict @@ -179,7 +179,7 @@ class Table(OrderedDict): def __init__(self, name=None, data=None): """Initialize a new table. - name (unicode): Optional table name for reference. + name (str): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. RETURNS (Table): The newly created object. @@ -197,7 +197,7 @@ class Table(OrderedDict): def __setitem__(self, key, value): """Set new key/value pair. String keys will be hashed. - key (unicode / int): The key to set. + key (str / int): The key to set. value: The value to set. """ key = get_string_id(key) @@ -208,7 +208,7 @@ class Table(OrderedDict): """Set new key/value pair. String keys will be hashed. Same as table[key] = value. - key (unicode / int): The key to set. + key (str / int): The key to set. value: The value to set. """ self[key] = value @@ -216,7 +216,7 @@ class Table(OrderedDict): def __getitem__(self, key): """Get the value for a given key. String keys will be hashed. - key (unicode / int): The key to get. + key (str / int): The key to get. RETURNS: The value. """ key = get_string_id(key) @@ -225,7 +225,7 @@ class Table(OrderedDict): def get(self, key, default=None): """Get the value for a given key. String keys will be hashed. - key (unicode / int): The key to get. + key (str / int): The key to get. default: The default value to return. RETURNS: The value. """ @@ -235,7 +235,7 @@ class Table(OrderedDict): def __contains__(self, key): """Check whether a key is in the table. String keys will be hashed. - key (unicode / int): The key to check. + key (str / int): The key to check. RETURNS (bool): Whether the key is in the table. """ key = get_string_id(key) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index ff707a71c..ddeeedd06 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -66,7 +66,7 @@ cdef class DependencyMatcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ return self._normalize_key(key) in self._patterns @@ -194,7 +194,7 @@ cdef class DependencyMatcher: def get(self, key, default=None): """Retrieve the pattern stored for a key. - key (unicode or int): The key to retrieve. + key (str / int): The key to retrieve. RETURNS (tuple): The rule, as an (on_match, patterns) tuple. """ key = self._normalize_key(key) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 8bd66cbca..158730e60 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -64,7 +64,7 @@ cdef class Matcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ return self._normalize_key(key) in self._patterns @@ -98,7 +98,7 @@ cdef class Matcher: number of arguments). The on_match callback becomes an optional keyword argument. - key (unicode): The match ID. + key (str): The match ID. 
patterns (list): The patterns to add for the given key. on_match (callable): Optional callback executed on match. *_patterns (list): For backwards compatibility: list of patterns to add @@ -139,7 +139,7 @@ cdef class Matcher: """Remove a rule from the matcher. A KeyError is raised if the key does not exist. - key (unicode): The ID of the match rule. + key (str): The ID of the match rule. """ norm_key = self._normalize_key(key) if not norm_key in self._patterns: @@ -166,7 +166,7 @@ cdef class Matcher: def get(self, key, default=None): """Retrieve the pattern stored for a key. - key (unicode or int): The key to retrieve. + key (str / int): The key to retrieve. RETURNS (tuple): The rule, as an (on_match, patterns) tuple. """ key = self._normalize_key(key) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 14cc39787..aa4534296 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -30,7 +30,7 @@ cdef class PhraseMatcher: """Initialize the PhraseMatcher. vocab (Vocab): The shared vocabulary. - attr (int / unicode): Token attribute to match on. + attr (int / str): Token attribute to match on. validate (bool): Perform additional validation when patterns are added. RETURNS (PhraseMatcher): The newly constructed object. @@ -70,7 +70,7 @@ cdef class PhraseMatcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. DOCS: https://spacy.io/api/phrasematcher#contains @@ -85,7 +85,7 @@ cdef class PhraseMatcher: """Remove a rule from the matcher by match ID. A KeyError is raised if the key does not exist. - key (unicode): The match ID. + key (str): The match ID. DOCS: https://spacy.io/api/phrasematcher#remove """ @@ -159,7 +159,7 @@ cdef class PhraseMatcher: number of arguments). The on_match callback becomes an optional keyword argument. - key (unicode): The match ID. + key (str): The match ID. docs (list): List of `Doc` objects representing match patterns. on_match (callable): Callback executed on match. *_docs (Doc): For backwards compatibility: list of patterns to add diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 710d36a1d..bdcd709b1 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -15,10 +15,10 @@ def build_tb_parser_model( use_upper=True, nO=None, ): - token_vector_width = tok2vec.get_dim("nO") + t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain( tok2vec, - with_array(Linear(hidden_width, token_vector_width)), + with_array(Linear(hidden_width, t2v_width)), list2array(), ) tok2vec.set_dim("nO", hidden_width) diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 683c8b518..00e268ede 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -6,9 +6,9 @@ from ...util import registry @registry.architectures.register("spacy.Tagger.v1") def build_tagger_model(tok2vec, nO=None) -> Model: - token_vector_width = tok2vec.get_dim("nO") # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! 
- output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init) + t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None + output_layer = Softmax(nO, t2v_width, init_W=zero_init) softmax = with_array(output_layer) model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index e4301a644..251189389 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -38,8 +38,8 @@ def forward(model, X, is_train): def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize() - lower = model.get_ref("lower").initialize(X=X) + tok2vec = model.get_ref("tok2vec").initialize(X=X) + lower = model.get_ref("lower").initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) model.get_ref("upper").initialize(X=statevecs) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index f7e38bbea..31d83244c 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -198,8 +198,8 @@ cdef class Morphology: """Add a special-case rule to the morphological analyser. Tokens whose tag and orth match the rule will receive the specified properties. - tag (unicode): The part-of-speech tag to key the exception. - orth (unicode): The word-form to key the exception. + tag (str): The part-of-speech tag to key the exception. + orth (str): The word-form to key the exception. """ attrs = dict(attrs) attrs = _normalize_props(attrs) diff --git a/spacy/analysis.py b/spacy/pipe_analysis.py similarity index 90% rename from spacy/analysis.py rename to spacy/pipe_analysis.py index c2600048f..971ebe518 100644 --- a/spacy/analysis.py +++ b/spacy/pipe_analysis.py @@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): fulfilled (e.g. if previous components assign the attributes). pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - name (unicode): The name of the pipeline component to analyze. + name (str): The name of the pipeline component to analyze. pipe (callable): The pipeline component function to analyze. index (int): The index of the component in the pipeline. warn (bool): Show user warning if problem is found. @@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr): """Get all pipeline components that assign an attr, e.g. "doc.tensor". pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - attr (unicode): The attribute to check. + attr (str): The attribute to check. RETURNS (list): (name, pipeline) tuples of components that assign the attr. """ return _get_feature_for_attr(pipeline, attr, "assigns") @@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr): """Get all pipeline components that require an attr, e.g. "doc.tensor". pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - attr (unicode): The attribute to check. + attr (str): The attribute to check. RETURNS (list): (name, pipeline) tuples of components that require the attr. """ return _get_feature_for_attr(pipeline, attr, "requires") @@ -173,3 +173,22 @@ def print_summary(nlp, pretty=True, no_print=False): msg.good("No problems found.") if no_print: return {"overview": overview, "problems": problems} + + +def count_pipeline_interdependencies(pipeline): + """Count how many subsequent components require an annotation set by each + component in the pipeline. 
+ """ + pipe_assigns = [] + pipe_requires = [] + for name, pipe in pipeline: + pipe_assigns.append(set(getattr(pipe, "assigns", []))) + pipe_requires.append(set(getattr(pipe, "requires", []))) + counts = [] + for i, assigns in enumerate(pipe_assigns): + count = 0 + for requires in pipe_requires[i + 1 :]: + if assigns.intersection(requires): + count += 1 + counts.append(count) + return counts diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 58160c2e9..bdc009192 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -30,7 +30,7 @@ class EntityRuler(object): nlp (Language): The shared nlp object to pass the vocab to the matchers and process phrase patterns. - phrase_matcher_attr (int / unicode): Token attribute to match on, passed + phrase_matcher_attr (int / str): Token attribute to match on, passed to the internal PhraseMatcher as `attr` validate (bool): Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate` @@ -315,7 +315,7 @@ class EntityRuler(object): """Load the entity ruler from a file. Expects a file containing newline-delimited JSON (JSONL) with one entry per line. - path (unicode / Path): The JSONL file to load. + path (str / Path): The JSONL file to load. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. @@ -351,7 +351,7 @@ class EntityRuler(object): """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). - path (unicode / Path): The JSONL file to save. + path (str / Path): The JSONL file to save. **kwargs: Other config paramters, mostly for consistency. DOCS: https://spacy.io/api/entityruler#to_disk diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 6e9d4197c..622791512 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"): """Merge subtokens into a single token. doc (Doc): The Doc object. - label (unicode): The subtoken dependency label. + label (str): The subtoken dependency label. RETURNS (Doc): The Doc object with merged subtokens. DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 6804a98c3..42110efb0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -531,7 +531,16 @@ class Tagger(Pipe): vocab.morphology.lemmatizer, exc=vocab.morphology.exc) self.set_output(len(self.labels)) - self.model.initialize() + doc_sample = [Doc(self.vocab, words=["hello", "world"])] + if pipeline is not None: + for name, component in pipeline: + if component is self: + break + if hasattr(component, "pipe"): + doc_sample = list(component.pipe(doc_sample)) + else: + doc_sample = [component(doc) for doc in doc_sample] + self.model.initialize(X=doc_sample) # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. link_vectors_to_models(self.vocab) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index a30f11729..9e584ce8a 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -109,7 +109,7 @@ cdef class StringStore: """Retrieve a string from a given hash, or vice versa. string_or_id (bytes, unicode or uint64): The value to encode. - Returns (unicode or uint64): The value to be retrieved. + Returns (str / uint64): The value to be retrieved. 
""" if isinstance(string_or_id, basestring) and len(string_or_id) == 0: return 0 @@ -152,7 +152,7 @@ cdef class StringStore: def add(self, string): """Add a string to the StringStore. - string (unicode): The string to add. + string (str): The string to add. RETURNS (uint64): The string's hash value. """ if isinstance(string, unicode): @@ -179,7 +179,7 @@ cdef class StringStore: def __contains__(self, string not None): """Check whether a string is in the store. - string (unicode): The string to check. + string (str): The string to check. RETURNS (bool): Whether the store contains the string. """ cdef hash_t key @@ -205,7 +205,7 @@ cdef class StringStore: def __iter__(self): """Iterate over the strings in the store, in order. - YIELDS (unicode): A string in the store. + YIELDS (str): A string in the store. """ cdef int i cdef hash_t key @@ -223,7 +223,7 @@ cdef class StringStore: def to_disk(self, path): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. """ path = util.ensure_path(path) @@ -234,7 +234,7 @@ cdef class StringStore: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. Paths may be either + path (str / Path): A path to a directory. Paths may be either strings or `Path`-like objects. RETURNS (StringStore): The modified `StringStore` object. """ diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 1437bdd98..fcaff444e 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -624,12 +624,25 @@ cdef class Parser: sgd = self.create_optimizer() doc_sample = [] gold_sample = [] - for example in islice(get_examples(), 1000): + for example in islice(get_examples(), 10): parses = example.get_gold_parses(merge=False, vocab=self.vocab) for doc, gold in parses: - doc_sample.append(doc) - gold_sample.append(gold) - self.model.initialize(doc_sample, gold_sample) + if len(doc): + doc_sample.append(doc) + gold_sample.append(gold) + + if pipeline is not None: + for name, component in pipeline: + if component is self: + break + if hasattr(component, "pipe"): + doc_sample = list(component.pipe(doc_sample)) + else: + doc_sample = [component(doc) for doc in doc_sample] + if doc_sample: + self.model.initialize(doc_sample) + else: + self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) link_vectors_to_models(self.vocab) diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 59a8569ee..0dc0f9d6c 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -9,7 +9,6 @@ def test_build_dependencies(): "pytest-timeout", "mock", "flake8", - "jsonschema", ] libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index cda39f6ee..b826438f5 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,7 +1,8 @@ import spacy.language from spacy.language import Language, component -from spacy.analysis import print_summary, validate_attrs -from spacy.analysis import get_assigns_for_attr, get_requires_for_attr +from spacy.pipe_analysis import print_summary, validate_attrs +from spacy.pipe_analysis import 
get_assigns_for_attr, get_requires_for_attr +from spacy.pipe_analysis import count_pipeline_interdependencies from mock import Mock, ANY import pytest @@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe(): with pytest.warns(None) as record: nlp.remove_pipe("c2") assert not record.list + + +def test_pipe_interdependencies(): + class Fancifier: + name = "fancifier" + assigns = ("doc._.fancy",) + requires = tuple() + + class FancyNeeder: + name = "needer" + assigns = tuple() + requires = ("doc._.fancy",) + + pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())] + counts = count_pipeline_interdependencies(pipeline) + assert counts == [1, 0] diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index c320b19c0..e4b4e570c 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -2,9 +2,11 @@ import pytest import os import ctypes from pathlib import Path +from spacy.about import __version__ as spacy_version from spacy import util from spacy import prefer_gpu, require_gpu -from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding +from spacy.ml._precomputable_affine import PrecomputableAffine +from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding @pytest.fixture @@ -24,10 +26,12 @@ def test_util_ensure_path_succeeds(text): assert isinstance(path, Path) -@pytest.mark.parametrize("package", ["numpy"]) -def test_util_is_package(package): +@pytest.mark.parametrize( + "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)] +) +def test_util_is_package(package, result): """Test that an installed package via pip is recognised by util.is_package.""" - assert util.is_package(package) + assert util.is_package(package) is result @pytest.mark.parametrize("package", ["thinc"]) @@ -87,3 +91,21 @@ def test_ascii_filenames(): root = Path(__file__).parent.parent for path in root.glob("**/*"): assert all(ord(c) < 128 for c in path.name), path.name + + +@pytest.mark.parametrize( + "version,constraint,compatible", + [ + (spacy_version, spacy_version, True), + (spacy_version, f">={spacy_version}", True), + ("3.0.0", "2.0.0", False), + ("3.2.1", ">=2.0.0", True), + ("2.2.10a1", ">=1.0.0,<2.1.1", False), + ("3.0.0.dev3", ">=1.2.3,<4.5.6", True), + ("n/a", ">=1.2.3,<4.5.6", None), + ("1.2.3", "n/a", None), + ("n/a", "n/a", None), + ], +) +def test_is_compatible_version(version, constraint, compatible): + assert util.is_compatible_version(version, constraint) is compatible diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py new file mode 100644 index 000000000..1410755db --- /dev/null +++ b/spacy/tests/test_util.py @@ -0,0 +1,59 @@ +import pytest +from spacy.gold import Example + +from .util import get_random_doc + +from spacy.util import minibatch_by_words + + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 400, 199], [3]), + ([400, 400, 199, 3], [4]), + ([400, 400, 199, 3, 200], [3, 2]), + ([400, 400, 199, 3, 1], [5]), + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded + ([400, 400, 199, 3, 1, 200], [3, 3]), + ([400, 400, 199, 3, 1, 999], [3, 3]), + ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), + ([1, 2, 999], [3]), + ([1, 2, 999, 1], [4]), + ([1, 200, 999, 1], [2, 2]), + ([1, 999, 200, 1], [2, 2]), + ], +) +def test_util_minibatch(doc_sizes, expected_batches): + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] + examples = [Example(doc=doc) for doc in docs] + tol = 0.2 + batch_size = 1000 + batches = 
list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True)) + assert [len(batch) for batch in batches] == expected_batches + + max_size = batch_size + batch_size * tol + for batch in batches: + assert sum([len(example.doc) for example in batch]) < max_size + + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 4000, 199], [1, 2]), + ([400, 400, 199, 3000, 200], [1, 4]), + ([400, 400, 199, 3, 1, 1500], [1, 5]), + ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]), + ([1, 2, 9999], [1, 2]), + ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]), + ], +) +def test_util_minibatch_oversize(doc_sizes, expected_batches): + """ Test that oversized documents are returned in their own batch""" + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] + examples = [Example(doc=doc) for doc in docs] + tol = 0.2 + batch_size = 1000 + batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False)) + assert [len(batch) for batch in batches] == expected_batches + + diff --git a/spacy/tests/util.py b/spacy/tests/util.py index e29342268..3d0a023c9 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -92,6 +92,13 @@ def get_batch(batch_size): return docs +def get_random_doc(n_words): + vocab = Vocab() + # Make the words numbers, so that they're easy to track. + numbers = [str(i) for i in range(0, n_words)] + return Doc(vocab, words=numbers) + + def apply_transition_sequence(parser, doc, sequence): """Perform a series of pre-specified transitions, to put the parser in a desired state.""" diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 7e75052f7..538bf60e9 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -134,7 +134,7 @@ cdef class Tokenizer: def __call__(self, unicode string): """Tokenize a string. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. DOCS: https://spacy.io/api/tokenizer#call @@ -147,7 +147,7 @@ cdef class Tokenizer: cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): """Tokenize according to affix and token_match settings. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. """ if len(string) >= (2 ** 30): @@ -527,7 +527,7 @@ cdef class Tokenizer: def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. - string (unicode): The string to segment. + string (str): The string to segment. RETURNS (list): A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. @@ -542,7 +542,7 @@ cdef class Tokenizer: """Find the length of a prefix that should be segmented from the string, or None if no prefix rules match. - string (unicode): The string to segment. + string (str): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`. DOCS: https://spacy.io/api/tokenizer#find_prefix @@ -556,7 +556,7 @@ cdef class Tokenizer: """Find the length of a suffix that should be segmented from the string, or None if no suffix rules match. - string (unicode): The string to segment. + string (str): The string to segment. Returns (int): The length of the suffix if present, otherwise `None`. 
DOCS: https://spacy.io/api/tokenizer#find_suffix @@ -576,7 +576,7 @@ cdef class Tokenizer: def _validate_special_case(self, chunk, substrings): """Check whether the `ORTH` fields match the string. - string (unicode): The string to specially tokenize. + string (str): The string to specially tokenize. substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. """ @@ -588,7 +588,7 @@ cdef class Tokenizer: def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. - string (unicode): The string to specially tokenize. + string (str): The string to specially tokenize. substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. @@ -629,7 +629,7 @@ cdef class Tokenizer: produced are identical to `nlp.tokenizer()` except for whitespace tokens. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (list): A list of (pattern_string, token_string) tuples DOCS: https://spacy.io/api/tokenizer#explain @@ -693,7 +693,7 @@ cdef class Tokenizer: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. exclude (list): String names of serialization fields to exclude. @@ -707,7 +707,7 @@ cdef class Tokenizer: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. + path (str / Path): A path to a directory. exclude (list): String names of serialization fields to exclude. RETURNS (Tokenizer): The modified `Tokenizer` object. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e6841eb80..debab6aeb 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -117,7 +117,7 @@ cdef class Doc: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Doc._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -135,7 +135,7 @@ cdef class Doc: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/doc#get_extension @@ -146,7 +146,7 @@ cdef class Doc: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/doc#has_extension @@ -157,7 +157,7 @@ cdef class Doc: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -483,7 +483,7 @@ cdef class Doc: def text(self): """A unicode representation of the document text. - RETURNS (unicode): The original verbatim text of the document. + RETURNS (str): The original verbatim text of the document. 
""" return "".join(t.text_with_ws for t in self) @@ -492,7 +492,7 @@ cdef class Doc: """An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. - RETURNS (unicode): The original verbatim text of the document. + RETURNS (str): The original verbatim text of the document. """ return self.text @@ -637,7 +637,7 @@ cdef class Doc: @property def lang_(self): - """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'.""" + """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'.""" return self.vocab.lang cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: @@ -852,7 +852,7 @@ cdef class Doc: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. exclude (list): String names of serialization fields to exclude. @@ -866,7 +866,7 @@ cdef class Doc: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. Paths may be either + path (str / Path): A path to a directory. Paths may be either strings or `Path`-like objects. exclude (list): String names of serialization fields to exclude. RETURNS (Doc): The modified `Doc` object. diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index e9b151985..b8f79f8a6 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -33,7 +33,7 @@ cdef class Span: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Span._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -51,7 +51,7 @@ cdef class Span: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/span#get_extension @@ -62,7 +62,7 @@ cdef class Span: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/span#has_extension @@ -73,7 +73,7 @@ cdef class Span: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -491,7 +491,7 @@ cdef class Span: @property def text(self): - """RETURNS (unicode): The original verbatim text of the span.""" + """RETURNS (str): The original verbatim text of the span.""" text = self.text_with_ws if self[-1].whitespace_: text = text[:-1] @@ -502,7 +502,7 @@ cdef class Span: """The text content of the span with a trailing whitespace character if the last token has one. - RETURNS (unicode): The text content of the span (with trailing + RETURNS (str): The text content of the span (with trailing whitespace). 
""" return "".join([t.text_with_ws for t in self]) @@ -678,7 +678,7 @@ cdef class Span: raise NotImplementedError(TempErrors.T007.format(attr="ent_id")) property ent_id_: - """RETURNS (unicode): The (string) entity ID.""" + """RETURNS (str): The (string) entity ID.""" def __get__(self): return self.root.ent_id_ @@ -690,12 +690,12 @@ cdef class Span: """Verbatim text content (identical to `Span.text`). Exists mostly for consistency with other attributes. - RETURNS (unicode): The span's text.""" + RETURNS (str): The span's text.""" return self.text @property def lemma_(self): - """RETURNS (unicode): The span's lemma.""" + """RETURNS (str): The span's lemma.""" return " ".join([t.lemma_ for t in self]).strip() @property @@ -714,7 +714,7 @@ cdef class Span: return "".join([t.text_with_ws for t in self]) property label_: - """RETURNS (unicode): The span's label.""" + """RETURNS (str): The span's label.""" def __get__(self): return self.doc.vocab.strings[self.label] @@ -724,7 +724,7 @@ cdef class Span: raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) property kb_id_: - """RETURNS (unicode): The named entity's KB ID.""" + """RETURNS (str): The named entity's KB ID.""" def __get__(self): return self.doc.vocab.strings[self.kb_id] diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 58e9196ea..320cfaad5 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -36,7 +36,7 @@ cdef class Token: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Token._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -54,7 +54,7 @@ cdef class Token: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/token#get_extension @@ -65,7 +65,7 @@ cdef class Token: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/token#has_extension @@ -76,7 +76,7 @@ cdef class Token: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -244,12 +244,12 @@ cdef class Token: @property def text(self): - """RETURNS (unicode): The original verbatim text of the token.""" + """RETURNS (str): The original verbatim text of the token.""" return self.orth_ @property def text_with_ws(self): - """RETURNS (unicode): The text content of the span (with trailing + """RETURNS (str): The text content of the span (with trailing whitespace). """ cdef unicode orth = self.vocab.strings[self.c.lex.orth] @@ -762,7 +762,7 @@ cdef class Token: self.c.ent_type = ent_type property ent_type_: - """RETURNS (unicode): Named entity type.""" + """RETURNS (str): Named entity type.""" def __get__(self): return self.vocab.strings[self.c.ent_type] @@ -785,7 +785,7 @@ cdef class Token: and "" means no entity tag is set. "B" with an empty ent_type means that the token is blocked from further processing by NER. 
- RETURNS (unicode): IOB code of named entity tag. + RETURNS (str): IOB code of named entity tag. """ iob_strings = ("", "I", "O", "B") return iob_strings[self.c.ent_iob] @@ -801,7 +801,7 @@ cdef class Token: self.c.ent_id = key property ent_id_: - """RETURNS (unicode): ID of the entity the token is an instance of, + """RETURNS (str): ID of the entity the token is an instance of, if any. """ def __get__(self): @@ -819,7 +819,7 @@ cdef class Token: self.c.ent_kb_id = ent_kb_id property ent_kb_id_: - """RETURNS (unicode): Named entity KB ID.""" + """RETURNS (str): Named entity KB ID.""" def __get__(self): return self.vocab.strings[self.c.ent_kb_id] @@ -828,12 +828,12 @@ cdef class Token: @property def whitespace_(self): - """RETURNS (unicode): The trailing whitespace character, if present.""" + """RETURNS (str): The trailing whitespace character, if present.""" return " " if self.c.spacy else "" @property def orth_(self): - """RETURNS (unicode): Verbatim text content (identical to + """RETURNS (str): Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. """ @@ -841,13 +841,13 @@ cdef class Token: @property def lower_(self): - """RETURNS (unicode): The lowercase token text. Equivalent to + """RETURNS (str): The lowercase token text. Equivalent to `Token.text.lower()`. """ return self.vocab.strings[self.c.lex.lower] property norm_: - """RETURNS (unicode): The token's norm, i.e. a normalised form of the + """RETURNS (str): The token's norm, i.e. a normalised form of the token text. Usually set in the language's tokenizer exceptions or norm exceptions. """ @@ -859,34 +859,34 @@ cdef class Token: @property def shape_(self): - """RETURNS (unicode): Transform of the tokens's string, to show + """RETURNS (str): Transform of the tokens's string, to show orthographic features. For example, "Xxxx" or "dd". """ return self.vocab.strings[self.c.lex.shape] @property def prefix_(self): - """RETURNS (unicode): A length-N substring from the start of the token. + """RETURNS (str): A length-N substring from the start of the token. Defaults to `N=1`. """ return self.vocab.strings[self.c.lex.prefix] @property def suffix_(self): - """RETURNS (unicode): A length-N substring from the end of the token. + """RETURNS (str): A length-N substring from the end of the token. Defaults to `N=3`. """ return self.vocab.strings[self.c.lex.suffix] @property def lang_(self): - """RETURNS (unicode): Language of the parent document's vocabulary, + """RETURNS (str): Language of the parent document's vocabulary, e.g. 'en'. """ return self.vocab.strings[self.c.lex.lang] property lemma_: - """RETURNS (unicode): The token lemma, i.e. the base form of the word, + """RETURNS (str): The token lemma, i.e. the base form of the word, with no inflectional suffixes. 
""" def __get__(self): @@ -899,7 +899,7 @@ cdef class Token: self.c.lemma = self.vocab.strings.add(lemma_) property pos_: - """RETURNS (unicode): Coarse-grained part-of-speech tag.""" + """RETURNS (str): Coarse-grained part-of-speech tag.""" def __get__(self): return parts_of_speech.NAMES[self.c.pos] @@ -907,7 +907,7 @@ cdef class Token: self.c.pos = parts_of_speech.IDS[pos_name] property tag_: - """RETURNS (unicode): Fine-grained part-of-speech tag.""" + """RETURNS (str): Fine-grained part-of-speech tag.""" def __get__(self): return self.vocab.strings[self.c.tag] @@ -915,7 +915,7 @@ cdef class Token: self.tag = self.vocab.strings.add(tag) property dep_: - """RETURNS (unicode): The syntactic dependency label.""" + """RETURNS (str): The syntactic dependency label.""" def __get__(self): return self.vocab.strings[self.c.dep] diff --git a/spacy/util.py b/spacy/util.py index a6ccae075..97cc5a8d7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -15,6 +15,8 @@ import srsly import catalogue import sys import warnings +from packaging.specifiers import SpecifierSet, InvalidSpecifier +from packaging.version import Version, InvalidVersion try: @@ -22,9 +24,16 @@ try: except ImportError: cupy = None +try: # Python 3.8 + import importlib.metadata as importlib_metadata +except ImportError: + import importlib_metadata + from .symbols import ORTH from .compat import cupy, CudaStream from .errors import Errors, Warnings +from . import about + _PRINT_ENV = False OOV_RANK = numpy.iinfo(numpy.uint64).max @@ -37,6 +46,10 @@ class registry(thinc.registry): factories = catalogue.create("spacy", "factories", entry_points=True) displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) assets = catalogue.create("spacy", "assets", entry_points=True) + # This is mostly used to get a list of all installed models in the current + # environment. spaCy models packaged with `spacy package` will "advertise" + # themselves via entry points. + models = catalogue.create("spacy", "models", entry_points=True) def set_env_log(value): @@ -49,7 +62,7 @@ def lang_class_is_loaded(lang): loaded lazily, to avoid expensive setup code associated with the language data. - lang (unicode): Two-letter language code, e.g. 'en'. + lang (str): Two-letter language code, e.g. 'en'. RETURNS (bool): Whether a Language class has been loaded. """ return lang in registry.languages @@ -58,7 +71,7 @@ def lang_class_is_loaded(lang): def get_lang_class(lang): """Import and load a Language class. - lang (unicode): Two-letter language code, e.g. 'en'. + lang (str): Two-letter language code, e.g. 'en'. RETURNS (Language): Language class. """ # Check if language is registered / entry point is available @@ -76,7 +89,7 @@ def get_lang_class(lang): def set_lang_class(name, cls): """Set a custom Language class name that can be loaded via get_lang_class. - name (unicode): Name of Language class. + name (str): Name of Language class. cls (Language): Language class. """ registry.languages.register(name, func=cls) @@ -98,7 +111,7 @@ def load_language_data(path): """Load JSON language data using the given path as a base. If the provided path isn't present, will attempt to load a gzipped version before giving up. - path (unicode / Path): The data to load. + path (str / Path): The data to load. RETURNS: The loaded data. """ path = ensure_path(path) @@ -119,7 +132,7 @@ def get_module_path(module): def load_model(name, **overrides): """Load a model from a package or data path. - name (unicode): Package name or model path. 
+ name (str): Package name or model path. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with the loaded model. """ @@ -193,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides): """Helper function to use in the `load()` method of a model package's __init__.py. - init_file (unicode): Path to model's __init__.py, i.e. `__file__`. + init_file (str): Path to model's __init__.py, i.e. `__file__`. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with loaded model. """ @@ -206,11 +219,74 @@ def load_model_from_init_py(init_file, **overrides): return load_model_from_path(data_path, meta, **overrides) +def get_installed_models(): + """List all model packages currently installed in the environment. + + RETURNS (list): The string names of the models. + """ + return list(registry.models.get_all().keys()) + + +def get_package_version(name): + """Get the version of an installed package. Typically used to get model + package versions. + + name (str): The name of the installed Python package. + RETURNS (str / None): The version or None if package not installed. + """ + try: + return importlib_metadata.version(name) + except importlib_metadata.PackageNotFoundError: + return None + + +def is_compatible_version(version, constraint, prereleases=True): + """Check if a version (e.g. "2.0.0") is compatible given a version + constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version, + it's interpreted as =={version}. + + version (str): The version to check. + constraint (str): The constraint string. + prereleases (bool): Whether to allow prereleases. If set to False, + prerelease versions will be considered incompatible. + RETURNS (bool / None): Whether the version is compatible, or None if the + version or constraint are invalid. + """ + # Handle cases where exact version is provided as constraint + if constraint[0].isdigit(): + constraint = f"=={constraint}" + try: + spec = SpecifierSet(constraint) + version = Version(version) + except (InvalidSpecifier, InvalidVersion): + return None + spec.prereleases = prereleases + return version in spec + + +def get_model_version_range(spacy_version): + """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy + version. Models are always compatible across patch versions but not + across minor or major versions. + """ + release = Version(spacy_version).release + return f">={spacy_version},<{release[0]}.{release[1] + 1}.0" + + +def get_base_version(version): + """Generate the base version without any prerelease identifiers. + + version (str): The version, e.g. "3.0.0.dev1". + RETURNS (str): The base version, e.g. "3.0.0". + """ + return Version(version).base_version + + def load_config(path, create_objects=False): """Load a Thinc-formatted config file, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. - path (unicode or Path): Path to the config file + path (str / Path): Path to the config file create_objects (bool): Whether to automatically create objects when the config references registry entries. Defaults to False. @@ -227,7 +303,7 @@ def load_config_from_str(string, create_objects=False): """Load a Thinc-formatted config, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. - string (unicode or Path): Text contents of the config file. 
+ string (str / Path): Text contents of the config file. create_objects (bool): Whether to automatically create objects when the config references registry entries. Defaults to False. @@ -243,7 +319,7 @@ def load_config_from_str(string, create_objects=False): def get_model_meta(path): """Get model meta.json from a directory path and validate its contents. - path (unicode or Path): Path to model directory. + path (str / Path): Path to model directory. RETURNS (dict): The model's meta data. """ model_path = ensure_path(path) @@ -256,13 +332,23 @@ def get_model_meta(path): for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) + if "spacy_version" in meta: + if not is_compatible_version(about.__version__, meta["spacy_version"]): + warnings.warn( + Warnings.W095.format( + model=f"{meta['lang']}_{meta['name']}", + model_version=meta["version"], + version=meta["spacy_version"], + current=about.__version__, + ) + ) return meta def get_model_config(path): """Get the model's config from a directory path. - path (unicode or Path): Path to model directory. + path (str / Path): Path to model directory. RETURNS (Config): The model's config data. """ model_path = ensure_path(path) @@ -279,23 +365,20 @@ def get_model_config(path): def is_package(name): """Check if string maps to a package installed via pip. - name (unicode): Name of package. + name (str): Name of package. RETURNS (bool): True if installed package, False if not. """ - import pkg_resources - - name = name.lower() # compare package name against lowercase name - packages = pkg_resources.working_set.by_key.keys() - for package in packages: - if package.lower().replace("-", "_") == name: - return True - return False + try: + importlib_metadata.distribution(name) + return True + except: # noqa: E722 + return False def get_package_path(name): """Get the path to an installed package. - name (unicode): Package name. + name (str): Package name. RETURNS (Path): Path to installed package. """ name = name.lower() # use lowercase version to be safe @@ -470,8 +553,8 @@ def expand_exc(excs, search, replace): For example, to add additional versions with typographic apostrophes. excs (dict): Tokenizer exceptions. - search (unicode): String to find and replace. - replace (unicode): Replacement. + search (str): String to find and replace. + replace (str): Replacement. RETURNS (dict): Combined tokenizer exceptions. """ @@ -575,42 +658,74 @@ def decaying(start, stop, decay): curr -= decay -def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2): +def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False): """Create minibatches of roughly a given number of words. 
If any examples are longer than the specified batch length, they will appear in a batch by - themselves.""" + themselves, or be discarded if discard_oversize=True.""" if isinstance(size, int): size_ = itertools.repeat(size) elif isinstance(size, List): size_ = iter(size) else: size_ = size - examples = iter(examples) - oversize = [] - while True: - batch_size = next(size_) - tol_size = batch_size * 0.2 - batch = [] - if oversize: - example = oversize.pop(0) - n_words = count_words(example.doc) + + target_size = next(size_) + tol_size = target_size * tolerance + batch = [] + overflow = [] + batch_size = 0 + overflow_size = 0 + + for example in examples: + n_words = count_words(example.doc) + # if the current example exceeds the maximum batch size, it is returned separately + # but only if discard_oversize=False. + if n_words > target_size + tol_size: + if not discard_oversize: + yield [example] + + # add the example to the current batch if there's no overflow yet and it still fits + elif overflow_size == 0 and (batch_size + n_words) <= target_size: batch.append(example) - batch_size -= n_words - while batch_size >= 1: - try: - example = next(examples) - except StopIteration: - if batch: - yield batch - return - n_words = count_words(example.doc) - if n_words < (batch_size + tol_size): - batch_size -= n_words - batch.append(example) - else: - oversize.append(example) - if batch: + batch_size += n_words + + # add the example to the overflow buffer if it fits in the tolerance margin + elif (batch_size + overflow_size + n_words) <= (target_size + tol_size): + overflow.append(example) + overflow_size += n_words + + # yield the previous batch and start a new one. The new one gets the overflow examples. + else: yield batch + target_size = next(size_) + tol_size = target_size * tolerance + batch = overflow + batch_size = overflow_size + overflow = [] + overflow_size = 0 + + # this example still fits + if (batch_size + n_words) <= target_size: + batch.append(example) + batch_size += n_words + + # this example fits in overflow + elif (batch_size + n_words) <= (target_size + tol_size): + overflow.append(example) + overflow_size += n_words + + # this example does not fit with the previous overflow: start another new batch + else: + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + batch = [example] + batch_size = n_words + + # yield the final batch + if batch: + batch.extend(overflow) + yield batch def itershuffle(iterable, bufsize=1000): @@ -705,8 +820,8 @@ def from_disk(path, readers, exclude): def import_file(name, loc): """Import module from a file. Used to load models from a directory. - name (unicode): Name of module to load. - loc (unicode / Path): Path to the file. + name (str): Name of module to load. + loc (str / Path): Path to the file. RETURNS: The loaded module. """ loc = str(loc) @@ -721,8 +836,8 @@ def minify_html(html): Disclaimer: NOT a general-purpose solution, only removes indentation and newlines. - html (unicode): Markup to minify. - RETURNS (unicode): "Minified" HTML. + html (str): Markup to minify. + RETURNS (str): "Minified" HTML. """ return html.strip().replace(" ", "").replace("\n", "") @@ -731,8 +846,8 @@ def escape_html(text): """Replace <, >, &, " with their HTML encoded representation. Intended to prevent HTML errors in rendered displaCy markup. - text (unicode): The original text. - RETURNS (unicode): Equivalent text to be safely used within HTML. + text (str): The original text. + RETURNS (str): Equivalent text to be safely used within HTML. 
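The rewritten `minibatch_by_words` drops the old `tuples` argument in favour of an explicit `tolerance` margin and a `discard_oversize` flag, and carries small trailing examples over into the next batch. A minimal usage sketch, with illustrative document sizes borrowed from the new `test_util.py` parameters:

```python
from spacy.gold import Example
from spacy.tokens import Doc
from spacy.util import minibatch_by_words
from spacy.vocab import Vocab

vocab = Vocab()
# Docs of 400, 400, 199, 3 and 200 tokens (numbers as stand-in words).
docs = [Doc(vocab, words=[str(i) for i in range(n)]) for n in (400, 400, 199, 3, 200)]
examples = [Example(doc=doc) for doc in docs]

# Target batch size 1000 with a 20% tolerance: the first three docs fill one
# batch (999 words); the 3-word doc overflows into the next batch together
# with the 200-word doc.
batches = list(
    minibatch_by_words(examples=examples, size=1000, tolerance=0.2, discard_oversize=True)
)
assert [len(batch) for batch in batches] == [3, 2]
```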
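The version helpers added to `spacy/util.py` wrap `packaging.specifiers` and `packaging.version` so model metadata can be validated against the running spaCy version (see the new `spacy_version` check in `get_model_meta`). A short sketch of the expected behaviour, reusing version/constraint pairs from the updated `test_misc.py` parameters; the `get_model_version_range` and `get_base_version` results follow directly from their docstrings:

```python
from spacy.util import get_base_version, get_model_version_range, is_compatible_version

assert is_compatible_version("3.2.1", ">=2.0.0") is True
assert is_compatible_version("2.2.10a1", ">=1.0.0,<2.1.1") is False
# A bare version as constraint is interpreted as "==2.0.0".
assert is_compatible_version("3.0.0", "2.0.0") is False
# Invalid versions or constraints yield None rather than raising.
assert is_compatible_version("n/a", ">=1.2.3,<4.5.6") is None

# Models stay compatible across patch releases, but not minor/major ones.
assert get_model_version_range("3.0.0") == ">=3.0.0,<3.1.0"
assert get_base_version("3.0.0.dev1") == "3.0.0"
```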
""" text = text.replace("&", "&") text = text.replace("<", "<") diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 471c6463f..4537d612d 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -57,7 +57,7 @@ cdef class Vectors: shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. - name (unicode): A name to identify the vectors table. + name (str): A name to identify the vectors table. RETURNS (Vectors): The newly created object. DOCS: https://spacy.io/api/vectors#init @@ -244,7 +244,7 @@ cdef class Vectors: def find(self, *, key=None, keys=None, row=None, rows=None): """Look up one or more keys by row, or vice versa. - key (unicode / int): Find the row that the given key points to. + key (str / int): Find the row that the given key points to. Returns int, -1 if missing. keys (iterable): Find rows that the keys point to. Returns ndarray. @@ -366,7 +366,7 @@ cdef class Vectors: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode / Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exists. DOCS: https://spacy.io/api/vectors#to_disk @@ -386,7 +386,7 @@ cdef class Vectors: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode / Path): Directory path, string or Path-like object. + path (str / Path): Directory path, string or Path-like object. RETURNS (Vectors): The modified object. DOCS: https://spacy.io/api/vectors#from_disk diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 505977be9..aacfb414c 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -504,10 +504,10 @@ tokenization can be provided. > srsly.write_jsonl("/path/to/text.jsonl", data) > ``` -| Key | Type | Description | -| -------- | ------- | ---------------------------------------------------------- | -| `text` | unicode | The raw input text. Is not required if `tokens` available. | -| `tokens` | list | Optional tokenization, one string per token. | +| Key | Type | Description | +| -------- | ---- | ---------------------------------------------------------- | +| `text` | str | The raw input text. Is not required if `tokens` available. | +| `tokens` | list | Optional tokenization, one string per token. | ```json ### Example diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 77d6fdd10..9dea04284 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -170,7 +170,7 @@ vocabulary. | Name | Type | Description | | ----------- | ---------------- | ------------------------------------------------------------------------------------------- | | `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | -| `string` | unicode | The string of the word to look up. | +| `string` | str | The string of the word to look up. | | **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. | ### Vocab.get_by_orth {#vocab_get_by_orth tag="method"} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index df0df3e38..0980dc2e0 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -229,9 +229,9 @@ Add a new label to the pipe. 
> parser.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. | ## DependencyParser.to_disk {#to_disk tag="method"} @@ -244,10 +244,10 @@ Serialize the pipe to disk. > parser.to_disk("/path/to/parser") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## DependencyParser.from_disk {#from_disk tag="method"} @@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 7decc2278..50fb10756 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -123,7 +123,7 @@ details, see the documentation on | Name | Type | Description | | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -145,10 +145,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. 
| +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | ## Doc.has_extension {#has_extension tag="classmethod" new="2"} @@ -162,10 +162,10 @@ Check whether an extension has been registered on the `Doc` class. > assert Doc.has_extension('has_city') > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. | ## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} @@ -180,10 +180,10 @@ Remove a previously registered extension. > assert not Doc.has_extension('has_city') > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | ## Doc.char_span {#char_span tag="method" new="2"} @@ -368,10 +368,10 @@ Save the current state to a directory. > doc.to_disk("/path/to/doc") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Doc.from_disk {#from_disk tag="method" new="2"} @@ -385,11 +385,11 @@ Loads state from a directory. Modifies the object in place and returns it. > doc = Doc(Vocab()).from_disk("/path/to/doc") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The modified `Doc` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The modified `Doc` object. 
| ## Doc.to_bytes {#to_bytes tag="method"} @@ -648,15 +648,15 @@ The L2 norm of the document's vector representation. | Name | Type | Description | | --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `text` | unicode | A unicode representation of the document text. | -| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | +| `text` | str | A unicode representation of the document text. | +| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | | `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `vocab` | `Vocab` | The store of lexical types. | | `tensor` 2 | `ndarray` | Container for dense vector representations. | | `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. | | `user_data` | - | A generic storage area, for user custom data. | | `lang` 2.1 | int | Language of the document's vocabulary. | -| `lang_` 2.1 | unicode | Language of the document's vocabulary. | +| `lang_` 2.1 | str | Language of the document's vocabulary. | | `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | | `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | | `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index a9d6a31a5..d7f25ed56 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -258,10 +258,10 @@ Serialize the pipe to disk. > entity_linker.to_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityLinker.from_disk {#from_disk tag="method"} @@ -274,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it. > entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
| -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Type | Description | +| ----------- | -------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 9345ee249..c9a81f6f1 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -230,9 +230,9 @@ Add a new label to the pipe. > ner.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. | ## EntityRecognizer.to_disk {#to_disk tag="method"} @@ -245,10 +245,10 @@ Serialize the pipe to disk. > ner.to_disk("/path/to/ner") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityRecognizer.from_disk {#from_disk tag="method"} @@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 0fd24897d..7bee3a77a 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -72,10 +72,10 @@ Whether a label is present in the patterns. > assert not "PERSON" in ruler > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------- | -| `label` | unicode | The label to check. | -| **RETURNS** | bool | Whether the entity ruler contains the label. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------- | +| `label` | str | The label to check. | +| **RETURNS** | bool | Whether the entity ruler contains the label. 
| ## EntityRuler.\_\_call\_\_ {#call tag="method"} @@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this happens automatically after the component has been added to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized with `overwrite_ents=True`, existing entities will be replaced if they overlap -with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer -patterns over shorter, and if equal the match occuring first in the Doc is chosen. +with the matches. When matches overlap in a Doc, the entity ruler prioritizes +longer patterns over shorter, and if equal the match occuring first in the Doc +is chosen. > #### Example > @@ -139,9 +140,9 @@ only the patterns are saved as JSONL. If a directory name is provided, a > ruler.to_disk("/path/to/entity_ruler") # saves patterns and config > ``` -| Name | Type | Description | -| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## EntityRuler.from_disk {#from_disk tag="method"} @@ -158,10 +159,10 @@ configuration. > ruler.from_disk("/path/to/entity_ruler") # loads patterns and config > ``` -| Name | Type | Description | -| ----------- | ---------------- | ---------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | +| Name | Type | Description | +| ----------- | ------------- | ---------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | ## EntityRuler.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/goldcorpus.md b/website/docs/api/goldcorpus.md index a18ef4d32..7767b28bd 100644 --- a/website/docs/api/goldcorpus.md +++ b/website/docs/api/goldcorpus.md @@ -17,8 +17,8 @@ Create a `GoldCorpus`. IF the input data is an iterable, each item should be a [`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx) for further details. -| Name | Type | Description | -| ----------- | --------------------------- | ------------------------------------------------------------ | -| `train` | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable. | -| `dev` | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. | -| **RETURNS** | `GoldCorpus` | The newly constructed object. 
| +| Name | Type | Description | +| ----------- | ----------------------- | ------------------------------------------------------------ | +| `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. | +| `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. | +| **RETURNS** | `GoldCorpus` | The newly constructed object. | diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 443913311..379913ba2 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -60,7 +60,8 @@ Whether the provided syntactic annotations form a projective dependency tree. Convert a list of Doc objects into the [JSON-serializable format](/api/annotation#json-input) used by the -[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc. +[`spacy train`](/api/cli#train) command. Each input doc will be treated as a +'paragraph' in the output doc. > #### Example > @@ -158,7 +159,7 @@ single-token entity. | ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | | `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | -| **RETURNS** | list | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags. | +| **RETURNS** | list | str strings, describing the [BILUO](/api/annotation#biluo) tags. | ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index eeba85e84..f088815fd 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -1,16 +1,19 @@ --- title: KnowledgeBase -teaser: A storage class for entities and aliases of a specific knowledge base (ontology) +teaser: + A storage class for entities and aliases of a specific knowledge base + (ontology) tag: class source: spacy/kb.pyx new: 2.2 --- -The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init) -objects, which are plausible external identifiers given a certain textual mention. -Each such `Candidate` holds information from the relevant KB entities, -such as its frequency in text and possible aliases. -Each entity in the knowledge base also has a pretrained entity vector of a fixed size. +The `KnowledgeBase` object provides a method to generate +[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external +identifiers given a certain textual mention. Each such `Candidate` holds +information from the relevant KB entities, such as its frequency in text and +possible aliases. Each entity in the knowledge base also has a pretrained entity +vector of a fixed size. ## KnowledgeBase.\_\_init\_\_ {#init tag="method"} @@ -24,25 +27,25 @@ Create the knowledge base. > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) > ``` -| Name | Type | Description | -| ----------------------- | ---------------- | ----------------------------------------- | -| `vocab` | `Vocab` | A `Vocab` object. | -| `entity_vector_length` | int | Length of the fixed-size entity vectors. | -| **RETURNS** | `KnowledgeBase` | The newly constructed object. 
| - +| Name | Type | Description | +| ---------------------- | --------------- | ---------------------------------------- | +| `vocab` | `Vocab` | A `Vocab` object. | +| `entity_vector_length` | int | Length of the fixed-size entity vectors. | +| **RETURNS** | `KnowledgeBase` | The newly constructed object. | ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} The length of the fixed-size entity vectors in the knowledge base. -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------- | -| **RETURNS** | int | Length of the fixed-size entity vectors. | +| Name | Type | Description | +| ----------- | ---- | ---------------------------------------- | +| **RETURNS** | int | Length of the fixed-size entity vectors. | ## KnowledgeBase.add_entity {#add_entity tag="method"} -Add an entity to the knowledge base, specifying its corpus frequency -and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length). +Add an entity to the knowledge base, specifying its corpus frequency and entity +vector, which should be of length +[`entity_vector_length`](/api/kb#entity_vector_length). > #### Example > @@ -51,16 +54,16 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en > kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2) > ``` -| Name | Type | Description | -| --------------- | ------------- | ------------------------------------------------- | -| `entity` | unicode | The unique entity identifier | -| `freq` | float | The frequency of the entity in a typical corpus | -| `entity_vector` | vector | The pretrained vector of the entity | +| Name | Type | Description | +| --------------- | ------ | ----------------------------------------------- | +| `entity` | str | The unique entity identifier | +| `freq` | float | The frequency of the entity in a typical corpus | +| `entity_vector` | vector | The pretrained vector of the entity | ## KnowledgeBase.set_entities {#set_entities tag="method"} -Define the full list of entities in the knowledge base, specifying the corpus frequency -and entity vector for each entity. +Define the full list of entities in the knowledge base, specifying the corpus +frequency and entity vector for each entity. > #### Example > @@ -68,18 +71,19 @@ and entity vector for each entity. > kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2]) > ``` -| Name | Type | Description | -| ------------- | ------------- | ------------------------------------------------- | -| `entity_list` | iterable | List of unique entity identifiers | -| `freq_list` | iterable | List of entity frequencies | -| `vector_list` | iterable | List of entity vectors | +| Name | Type | Description | +| ------------- | -------- | --------------------------------- | +| `entity_list` | iterable | List of unique entity identifiers | +| `freq_list` | iterable | List of entity frequencies | +| `vector_list` | iterable | List of entity vectors | ## KnowledgeBase.add_alias {#add_alias tag="method"} -Add an alias or mention to the knowledge base, specifying its potential KB identifiers -and their prior probabilities. The entity identifiers should refer to entities previously -added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities). -The sum of the prior probabilities should not exceed 1. +Add an alias or mention to the knowledge base, specifying its potential KB +identifiers and their prior probabilities. 
The entity identifiers should refer +to entities previously added with [`add_entity`](/api/kb#add_entity) or +[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities +should not exceed 1. > #### Example > @@ -87,11 +91,11 @@ The sum of the prior probabilities should not exceed 1. > kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3]) > ``` -| Name | Type | Description | -| -------------- | ------------- | -------------------------------------------------- | -| `alias` | unicode | The textual mention or alias | -| `entities` | iterable | The potential entities that the alias may refer to | -| `probabilities`| iterable | The prior probabilities of each entity | +| Name | Type | Description | +| --------------- | -------- | -------------------------------------------------- | +| `alias` | str | The textual mention or alias | +| `entities` | iterable | The potential entities that the alias may refer to | +| `probabilities` | iterable | The prior probabilities of each entity | ## KnowledgeBase.\_\_len\_\_ {#len tag="method"} @@ -117,9 +121,9 @@ Get a list of all entity IDs in the knowledge base. > all_entities = kb.get_entity_strings() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | list | The list of entities in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------- | +| **RETURNS** | list | The list of entities in the knowledge base. | ## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"} @@ -131,9 +135,9 @@ Get the total number of aliases in the knowledge base. > total_aliases = kb.get_size_aliases() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | int | The number of aliases in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------- | +| **RETURNS** | int | The number of aliases in the knowledge base. | ## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"} @@ -145,9 +149,9 @@ Get a list of all aliases in the knowledge base. > all_aliases = kb.get_alias_strings() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | list | The list of aliases in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| **RETURNS** | list | The list of aliases in the knowledge base. | ## KnowledgeBase.get_candidates {#get_candidates tag="method"} @@ -160,10 +164,10 @@ of type [`Candidate`](/api/kb/#candidate_init). > candidates = kb.get_candidates("Douglas") > ``` -| Name | Type | Description | -| ------------- | ------------- | -------------------------------------------------- | -| `alias` | unicode | The textual mention or alias | -| **RETURNS** | iterable | The list of relevant `Candidate` objects | +| Name | Type | Description | +| ----------- | -------- | ---------------------------------------- | +| `alias` | str | The textual mention or alias | +| **RETURNS** | iterable | The list of relevant `Candidate` objects | ## KnowledgeBase.get_vector {#get_vector tag="method"} @@ -175,15 +179,15 @@ Given a certain entity ID, retrieve its pretrained entity vector. 
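Taken together, the `KnowledgeBase` methods reformatted in the hunks above form a small workflow: create the KB, register entities and aliases, then query it. A minimal sketch of that flow, assuming toy entity vectors of length 3 (the IDs, frequencies and probabilities mirror the doc examples and are purely illustrative):

```python
from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
# entity_vector_length must match the length of the vectors passed to add_entity
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# register entities with a corpus frequency and a fixed-size entity vector
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 0.0, 0.0])
kb.add_entity(entity="Q463035", freq=111, entity_vector=[0.0, 1.0, 0.0])

# register a textual mention; the prior probabilities should sum to at most 1
kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])

# query the KB
for candidate in kb.get_candidates("Douglas"):
    print(candidate.entity_, candidate.prior_prob)
print(kb.get_vector("Q42"))
```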
> vector = kb.get_vector("Q42") > ``` -| Name | Type | Description | -| ------------- | ------------- | -------------------------------------------------- | -| `entity` | unicode | The entity ID | -| **RETURNS** | vector | The entity vector | +| Name | Type | Description | +| ----------- | ------ | ----------------- | +| `entity` | str | The entity ID | +| **RETURNS** | vector | The entity vector | ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"} -Given a certain entity ID and a certain textual mention, retrieve -the prior probability of the fact that the mention links to the entity ID. +Given a certain entity ID and a certain textual mention, retrieve the prior +probability of the fact that the mention links to the entity ID. > #### Example > @@ -191,11 +195,11 @@ the prior probability of the fact that the mention links to the entity ID. > probability = kb.get_prior_prob("Q42", "Douglas") > ``` -| Name | Type | Description | -| ------------- | ------------- | --------------------------------------------------------------- | -| `entity` | unicode | The entity ID | -| `alias` | unicode | The textual mention or alias | -| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------------------------- | +| `entity` | str | The entity ID | +| `alias` | str | The textual mention or alias | +| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | ## KnowledgeBase.dump {#dump tag="method"} @@ -207,14 +211,14 @@ Save the current state of the knowledge base to a directory. > kb.dump(loc) > ``` -| Name | Type | Description | -| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ | -| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## KnowledgeBase.load_bulk {#load_bulk tag="method"} -Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab) -should also be the same as the one used to create the KB. +Restore the state of the knowledge base from a given directory. Note that the +[`Vocab`](/api/vocab) should also be the same as the one used to create the KB. > #### Example > @@ -226,18 +230,16 @@ should also be the same as the one used to create the KB. > kb.load_bulk("/path/to/kb") > ``` - -| Name | Type | Description | -| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | -| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | - +| Name | Type | Description | +| ----------- | --------------- | -------------------------------------------------------------------------- | +| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. 
| ## Candidate.\_\_init\_\_ {#candidate_init tag="method"} Construct a `Candidate` object. Usually this constructor is not called directly, -but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method -of a `KnowledgeBase`. +but instead these objects are returned by the +[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`. > #### Example > @@ -257,12 +259,12 @@ of a `KnowledgeBase`. ## Candidate attributes {#candidate_attributes} -| Name | Type | Description | -| ---------------------- | ------------ | ------------------------------------------------------------------ | -| `entity` | int | The entity's unique KB identifier | -| `entity_` | unicode | The entity's unique KB identifier | -| `alias` | int | The alias or textual mention | -| `alias_` | unicode | The alias or textual mention | -| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | -| `entity_freq` | long | The frequency of the entity in a typical corpus | -| `entity_vector` | vector | The pretrained vector of the entity | +| Name | Type | Description | +| --------------- | ------ | -------------------------------------------------------------- | +| `entity` | int | The entity's unique KB identifier | +| `entity_` | str | The entity's unique KB identifier | +| `alias` | int | The alias or textual mention | +| `alias_` | str | The alias or textual mention | +| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | +| `entity_freq` | long | The frequency of the entity in a typical corpus | +| `entity_vector` | vector | The pretrained vector of the entity | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 50689a7ef..e1991f260 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -49,11 +49,11 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------------------- | -| `text` | unicode | The text to be processed. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Doc` | A container for accessing the annotations. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------------------- | +| `text` | str | The text to be processed. | +| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| **RETURNS** | `Doc` | A container for accessing the annotations. | @@ -201,7 +201,7 @@ Create a pipeline component from a factory. | Name | Type | Description | | ----------- | -------- | ---------------------------------------------------------------------------------- | -| `name` | unicode | Factory name to look up in [`Language.factories`](/api/language#class-attributes). | +| `name` | str | Factory name to look up in [`Language.factories`](/api/language#class-attributes). | | `config` | dict | Configuration parameters to initialize component. | | **RETURNS** | callable | The pipeline component. | @@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. 
Only one of `before`, `after`, | Name | Type | Description | | ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `component` | callable | The pipeline component. | -| `name` | unicode | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. | -| `before` | unicode | Component name to insert component directly before. | -| `after` | unicode | Component name to insert component directly after: | +| `name` | str | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. | +| `before` | str | Component name to insert component directly before. | +| `after` | str | Component name to insert component directly after: | | `first` | bool | Insert component first / not first in the pipeline. | | `last` | bool | Insert component last / not last in the pipeline. | @@ -243,10 +243,10 @@ Check whether a component is present in the pipeline. Equivalent to > assert nlp.has_pipe("component") > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------- | -| `name` | unicode | Name of the pipeline component to check. | -| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------- | +| `name` | str | Name of the pipeline component to check. | +| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | ## Language.get_pipe {#get_pipe tag="method" new="2"} @@ -261,7 +261,7 @@ Get a pipeline component for a given component name. | Name | Type | Description | | ----------- | -------- | -------------------------------------- | -| `name` | unicode | Name of the pipeline component to get. | +| `name` | str | Name of the pipeline component to get. | | **RETURNS** | callable | The pipeline component. | ## Language.replace_pipe {#replace_pipe tag="method" new="2"} @@ -276,7 +276,7 @@ Replace a component in the pipeline. | Name | Type | Description | | ----------- | -------- | --------------------------------- | -| `name` | unicode | Name of the component to replace. | +| `name` | str | Name of the component to replace. | | `component` | callable | The pipeline component to insert. | ## Language.rename_pipe {#rename_pipe tag="method" new="2"} @@ -292,10 +292,10 @@ added to the pipeline, you can also use the `name` argument on > nlp.rename_pipe("parser", "spacy_parser") > ``` -| Name | Type | Description | -| ---------- | ------- | -------------------------------- | -| `old_name` | unicode | Name of the component to rename. | -| `new_name` | unicode | New name of the component. | +| Name | Type | Description | +| ---------- | ---- | -------------------------------- | +| `old_name` | str | Name of the component to rename. | +| `new_name` | str | New name of the component. | ## Language.remove_pipe {#remove_pipe tag="method" new="2"} @@ -309,10 +309,10 @@ component function. 
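The pipeline-management methods in this hunk (`add_pipe`, `has_pipe`, `get_pipe`, `rename_pipe`, `remove_pipe`) compose into a simple round trip. A sketch against the signatures documented here, using a blank English pipeline and a hypothetical pass-through component:

```python
import spacy

nlp = spacy.blank("en")

def doc_length(doc):
    # hypothetical component: report the number of tokens and return the Doc unchanged
    print("tokens:", len(doc))
    return doc

# add the component under an explicit name, then inspect and rearrange the pipeline
nlp.add_pipe(doc_length, name="doc_length", first=True)
assert nlp.has_pipe("doc_length")
assert nlp.get_pipe("doc_length") is doc_length

nlp.rename_pipe("doc_length", "length_logger")
name, component = nlp.remove_pipe("length_logger")
assert name == "length_logger" and component is doc_length
```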
> assert name == "parser" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `name` | unicode | Name of the component to remove. | -| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | +| Name | Type | Description | +| ----------- | ----- | ----------------------------------------------------- | +| `name` | str | Name of the component to remove. | +| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} @@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled. | Name | Type | Description | | ----------- | --------------- | ------------------------------------------------------------------------------------ | | `disable` | list | Names of pipeline components to disable. | -| `disable` | unicode | Name of pipeline component to disable. | +| `disable` | str | Name of pipeline component to disable. | | `enable` | list | Names of pipeline components that will not be disabled. | -| `enable` | unicode | Name of pipeline component that will not be disabled. | +| `enable` | str | Name of pipeline component that will not be disabled. | | **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | - As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: @@ -370,10 +369,10 @@ the model**. > nlp.to_disk("/path/to/models") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | ## Language.from_disk {#from_disk tag="method" new="2"} @@ -395,11 +394,11 @@ loaded object. > nlp = English().from_disk("/path/to/en_model") > ``` -| Name | Type | Description | -| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The modified `Language` object. | +| Name | Type | Description | +| ----------- | ------------ | ----------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The modified `Language` object. | @@ -480,11 +479,11 @@ per component. 
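`select_pipes` and the disk round trip documented above are commonly combined. A sketch assuming a trained pipeline such as `en_core_web_sm` is installed and that `/path/to/en_model` is writable (both names are illustrative):

```python
import spacy
from spacy.lang.en import English

nlp = spacy.load("en_core_web_sm")

# run only the entity recognizer; the other components are disabled inside
# the block and restored automatically on exit
with nlp.select_pipes(enable="ner"):
    doc = nlp("Ada Lovelace was born in London.")
    print([(ent.text, ent.label_) for ent in doc.ents])

# serialize the whole pipeline and load it back into a fresh Language object
nlp.to_disk("/path/to/en_model")
nlp_reloaded = English().from_disk("/path/to/en_model")
```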
## Class attributes {#class-attributes} -| Name | Type | Description | -| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | -| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | -| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | +| Name | Type | Description | +| -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | +| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | +| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index f43e17fd3..16cd624f5 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -63,8 +63,8 @@ Lemmatize a string. | Name | Type | Description | | ------------ | ------------- | -------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to lemmatize, e.g. the token text. | -| `univ_pos` | unicode / int | The token's universal part-of-speech tag. | +| `string` | str | The string to lemmatize, e.g. the token text. | +| `univ_pos` | str / int | The token's universal part-of-speech tag. | | `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. | | **RETURNS** | list | The available lemmas for the string. | @@ -82,11 +82,11 @@ original string is returned. Languages can provide a > assert lemmatizer.lookup("going") == "go" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to look up. | -| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | -| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- | +| `string` | str | The string to look up. | +| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | +| **RETURNS** | str | The lemma if the string was found, otherwise the original string. | ## Lemmatizer.is_base_form {#is_base_form tag="method"} @@ -102,11 +102,11 @@ lemmatization entirely. > assert is_base_form == True > ``` -| Name | Type | Description | -| ------------ | ------------- | --------------------------------------------------------------------------------------- | -| `univ_pos` | unicode / int | The token's universal part-of-speech tag. 
| -| `morphology` | dict | The token's morphological features. | -| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | +| Name | Type | Description | +| ------------ | --------- | --------------------------------------------------------------------------------------- | +| `univ_pos` | str / int | The token's universal part-of-speech tag. | +| `morphology` | dict | The token's morphological features. | +| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | ## Attributes {#attributes} diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index feb167a9d..39148e476 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation. | Name | Type | Description | | -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `vocab` | `Vocab` | The lexeme's vocabulary. | -| `text` | unicode | Verbatim text content. | +| `text` | str | Verbatim text content. | | `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | +| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | | `rank` | int | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. | | `flags` | int | Container of the lexeme's binary flags. | | `norm` | int | The lexemes's norm, i.e. a normalized form of the lexeme text. | -| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. | +| `norm_` | str | The lexemes's norm, i.e. a normalized form of the lexeme text. | | `lower` | int | Lowercase form of the word. | -| `lower_` | unicode | Lowercase form of the word. | +| `lower_` | str | Lowercase form of the word. | | `shape` | int | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | | `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | -| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. | +| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. | | `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. 
| +| `suffix_` | str | Length-N substring from the start of the word. Defaults to `N=3`. | | `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | | `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. | | `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | @@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation. | `is_oov` | bool | Is the lexeme out-of-vocabulary? | | `is_stop` | bool | Is the lexeme part of a "stop list"? | | `lang` | int | Language of the parent vocabulary. | -| `lang_` | unicode | Language of the parent vocabulary. | +| `lang_` | str | Language of the parent vocabulary. | | `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | | `cluster` | int | Brown cluster ID. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md index bd3b38303..b91d92646 100644 --- a/website/docs/api/lookups.md +++ b/website/docs/api/lookups.md @@ -56,10 +56,10 @@ Check if the lookups contain a table of a given name. Delegates to > assert "some_table" in lookups > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------- | -| `name` | unicode | Name of the table. | -| **RETURNS** | bool | Whether a table of that name is in the lookups. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------- | +| `name` | str | Name of the table. | +| **RETURNS** | bool | Whether a table of that name is in the lookups. | ## Lookups.tables {#tables tag="property"} @@ -91,7 +91,7 @@ exists. | Name | Type | Description | | ----------- | ----------------------------- | ---------------------------------- | -| `name` | unicode | Unique name of the table. | +| `name` | str | Unique name of the table. | | `data` | dict | Optional data to add to the table. | | **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. | @@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist. | Name | Type | Description | | ----------- | ----------------------------- | ------------------ | -| `name` | unicode | Name of the table. | +| `name` | str | Name of the table. | | **RETURNS** | [`Table`](/api/lookups#table) | The table. | ## Lookups.remove_table {#remove_table tag="method"} @@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist. | Name | Type | Description | | ----------- | ----------------------------- | ---------------------------- | -| `name` | unicode | Name of the table to remove. | +| `name` | str | Name of the table to remove. | | **RETURNS** | [`Table`](/api/lookups#table) | The removed table. | ## Lookups.has_table {#has_table tag="method"} @@ -144,10 +144,10 @@ Check if the lookups contain a table of a given name. Equivalent to > assert lookups.has_table("some_table") > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------- | -| `name` | unicode | Name of the table. | -| **RETURNS** | bool | Whether a table of that name is in the lookups. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------- | +| `name` | str | Name of the table. 
| +| **RETURNS** | bool | Whether a table of that name is in the lookups. | ## Lookups.to_bytes {#to_bytes tag="method"} @@ -191,9 +191,9 @@ which will be created if it doesn't exist. > lookups.to_disk("/path/to/lookups") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Lookups.from_disk {#from_disk tag="method"} @@ -208,10 +208,10 @@ the file doesn't exist. > lookups.from_disk("/path/to/lookups") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Lookups` | The loaded lookups. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Lookups` | The loaded lookups. | ## Table {#table tag="class, ordererddict"} @@ -238,7 +238,7 @@ Initialize a new table. | Name | Type | Description | | ----------- | ------- | ---------------------------------- | -| `name` | unicode | Optional table name for reference. | +| `name` | str | Optional table name for reference. | | **RETURNS** | `Table` | The newly constructed object. | ### Table.from_dict {#table.from_dict tag="classmethod"} @@ -256,7 +256,7 @@ Initialize a new table from a dict. | Name | Type | Description | | ----------- | ------- | ---------------------------------- | | `data` | dict | The dictionary. | -| `name` | unicode | Optional table name for reference. | +| `name` | str | Optional table name for reference. | | **RETURNS** | `Table` | The newly constructed object. | ### Table.set {#table.set tag="method"} @@ -273,10 +273,10 @@ Set a new key / value pair. String keys will be hashed. Same as > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ------- | ------------- | ----------- | -| `key` | unicode / int | The key. | -| `value` | - | The value. | +| Name | Type | Description | +| ------- | --------- | ----------- | +| `key` | str / int | The key. | +| `value` | - | The value. | ### Table.to_bytes {#table.to_bytes tag="method"} @@ -313,6 +313,6 @@ Load a table from a bytestring. | Name | Type | Description | | -------------- | --------------------------- | ----------------------------------------------------- | -| `name` | unicode | Table name. | +| `name` | str | Table name. | | `default_size` | int | Default size of bloom filters if no data is provided. | | `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index bfd4fb0ec..8a872558c 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -125,10 +125,10 @@ Check whether the matcher contains rules for a match ID. 
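For the `Matcher` methods diffed below (`__contains__`, `add`, `remove`, `get`), a minimal sketch following the `add(match_id, on_match, *patterns)` signature documented in this hunk (later versions move `on_match` to a keyword argument); the rule name and token pattern are illustrative:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# a token pattern matching "hello world", case-insensitively
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)  # no on_match callback

assert "HelloWorld" in matcher
on_match, patterns = matcher.get("HelloWorld")

doc = nlp("Hello world! hello world.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)

matcher.remove("HelloWorld")
assert "HelloWorld" not in matcher
```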
> assert 'Rule' in matcher > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `key` | unicode | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------- | +| `key` | str | The match ID. | +| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | ## Matcher.add {#add tag="method" new="2"} @@ -153,7 +153,7 @@ overwritten. | Name | Type | Description | | ----------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | unicode | An ID for the thing you're matching. | +| `match_id` | str | An ID for the thing you're matching. | | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | @@ -188,9 +188,9 @@ exist. > assert "Rule" not in matcher > ``` -| Name | Type | Description | -| ----- | ------- | ------------------------- | -| `key` | unicode | The ID of the match rule. | +| Name | Type | Description | +| ----- | ---- | ------------------------- | +| `key` | str | The ID of the match rule. | ## Matcher.get {#get tag="method" new="2"} @@ -204,7 +204,7 @@ Retrieve the pattern stored for a key. Returns the rule as an > on_match, patterns = matcher.get("Rule") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------- | -| `key` | unicode | The ID of the match rule. | -| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------- | +| `key` | str | The ID of the match rule. | +| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index a72277420..fa6729f41 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -133,10 +133,10 @@ Check whether the matcher contains rules for a match ID. > assert "OBAMA" in matcher > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `key` | unicode | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------- | +| `key` | str | The match ID. | +| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | ## PhraseMatcher.add {#add tag="method"} @@ -162,7 +162,7 @@ overwritten. | Name | Type | Description | | ---------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | unicode | An ID for the thing you're matching. | +| `match_id` | str | An ID for the thing you're matching. | | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `*docs` | `Doc` | `Doc` objects of the phrases to match. | @@ -198,6 +198,6 @@ does not exist. 
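The `PhraseMatcher` flow mirrors the `Matcher` one, but takes `Doc` objects as patterns, again following the `add(match_id, on_match, *docs)` signature documented here. A sketch with an illustrative term list:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)

# phrase patterns are Doc objects, typically built with nlp.make_doc
patterns = [nlp.make_doc(text) for text in ["Barack Obama", "Barack Hussein Obama"]]
matcher.add("OBAMA", None, *patterns)

assert "OBAMA" in matcher
doc = nlp("Barack Obama visited Berlin.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)

matcher.remove("OBAMA")
assert "OBAMA" not in matcher
```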
> assert "OBAMA" not in matcher > ``` -| Name | Type | Description | -| ----- | ------- | ------------------------- | -| `key` | unicode | The ID of the match rule. | +| Name | Type | Description | +| ----- | ---- | ------------------------- | +| `key` | str | The ID of the match rule. | diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 6e2b473b1..fc417845c 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -112,8 +112,8 @@ end of the pipeline and after all other components. -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| `label` | unicode | The subtoken dependency label. Defaults to `"subtok"`. | -| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------ | +| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | +| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. | +| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. | diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index c9b935f22..03e843fcc 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -81,9 +81,9 @@ a file `sentencizer.json`. This also happens automatically when you save an > sentencizer.to_disk("/path/to/sentencizer.jsonl") > ``` -| Name | Type | Description | -| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Sentencizer.from_disk {#from_disk tag="method"} @@ -98,10 +98,10 @@ added to its pipeline. > sentencizer.from_disk("/path/to/sentencizer.json") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | +| Name | Type | Description | +| ----------- | ------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. 
| ## Sentencizer.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 3833bbca9..c41d9aa03 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -110,7 +110,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -132,10 +132,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | ## Span.has_extension {#has_extension tag="classmethod" new="2"} @@ -149,10 +149,10 @@ Check whether an extension has been registered on the `Span` class. > assert Span.has_extension("is_city") > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. | ## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} @@ -167,10 +167,10 @@ Remove a previously registered extension. > assert not Span.has_extension("is_city") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | ## Span.char_span {#char_span tag="method" new="2.2.4"} @@ -497,16 +497,16 @@ The L2 norm of the span's vector representation. | `end` | int | The token offset for the end of the span. | | `start_char` | int | The character offset for the start of the span. | | `end_char` | int | The character offset for the end of the span. | -| `text` | unicode | A unicode representation of the span text. 
| -| `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. | +| `text` | str | A unicode representation of the span text. | +| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. | | `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | +| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | | `label` | int | The hash value of the span's label. | -| `label_` | unicode | The span's label. | -| `lemma_` | unicode | The span's lemma. | +| `label_` | str | The span's label. | +| `lemma_` | str | The span's lemma. | | `kb_id` | int | The hash value of the knowledge base ID referred to by the span. | -| `kb_id_` | unicode | The knowledge base ID referred to by the span. | +| `kb_id_` | str | The knowledge base ID referred to by the span. | | `ent_id` | int | The hash value of the named entity the token is an instance of. | -| `ent_id_` | unicode | The string ID of the named entity the token is an instance of. | +| `ent_id_` | str | The string ID of the named entity the token is an instance of. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the span. | | `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index 268f19125..922174c78 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa. | Name | Type | Description | | -------------- | ------------------------ | -------------------------- | | `string_or_id` | bytes, unicode or uint64 | The value to encode. | -| **RETURNS** | unicode or int | The value to be retrieved. | +| **RETURNS** | str or int | The value to be retrieved. | ## StringStore.\_\_contains\_\_ {#contains tag="method"} @@ -69,10 +69,10 @@ Check whether a string is in the store. > assert not "cherry" in stringstore > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------- | -| `string` | unicode | The string to check. | -| **RETURNS** | bool | Whether the store contains the string. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------- | +| `string` | str | The string to check. | +| **RETURNS** | bool | Whether the store contains the string. | ## StringStore.\_\_iter\_\_ {#iter tag="method"} @@ -87,9 +87,9 @@ store will always include an empty string `''` at position `0`. > assert all_strings == ["apple", "orange"] > ``` -| Name | Type | Description | -| ---------- | ------- | ---------------------- | -| **YIELDS** | unicode | A string in the store. | +| Name | Type | Description | +| ---------- | ---- | ---------------------- | +| **YIELDS** | str | A string in the store. | ## StringStore.add {#add tag="method" new="2"} @@ -106,10 +106,10 @@ Add a string to the `StringStore`. > assert stringstore["banana"] == banana_hash > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------ | -| `string` | unicode | The string to add. | -| **RETURNS** | uint64 | The string's hash value. 
| +| Name | Type | Description | +| ----------- | ------ | ------------------------ | +| `string` | str | The string to add. | +| **RETURNS** | uint64 | The string's hash value. | ## StringStore.to_disk {#to_disk tag="method" new="2"} @@ -121,9 +121,9 @@ Save the current state to a directory. > stringstore.to_disk("/path/to/strings") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## StringStore.from_disk {#from_disk tag="method" new="2"} @@ -136,10 +136,10 @@ Loads state from a directory. Modifies the object in place and returns it. > stringstore = StringStore().from_disk("/path/to/strings") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `StringStore` | The modified `StringStore` object. | +| Name | Type | Description | +| ----------- | ------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `StringStore` | The modified `StringStore` object. | ## StringStore.to_bytes {#to_bytes tag="method"} @@ -185,7 +185,7 @@ Get a 64-bit hash for a given string. > assert hash_string("apple") == 8566208034543834098 > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------- | -| `string` | unicode | The string to hash. | -| **RETURNS** | uint64 | The hash. | +| Name | Type | Description | +| ----------- | ------ | ------------------- | +| `string` | str | The string to hash. | +| **RETURNS** | uint64 | The hash. | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index bd3382f89..f14da3ac5 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -229,10 +229,10 @@ Add a new label to the pipe. > tagger.add_label("MY_LABEL", {POS: 'NOUN'}) > ``` -| Name | Type | Description | -| -------- | ------- | --------------------------------------------------------------- | -| `label` | unicode | The label to add. | -| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. | +| Name | Type | Description | +| -------- | ---- | --------------------------------------------------------------- | +| `label` | str | The label to add. | +| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. | ## Tagger.to_disk {#to_disk tag="method"} @@ -245,10 +245,10 @@ Serialize the pipe to disk. > tagger.to_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. 
Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Tagger.from_disk {#from_disk tag="method"} @@ -261,11 +261,11 @@ Load the pipe from disk. Modifies the object in place and returns it. > tagger.from_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The modified `Tagger` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The modified `Tagger` object. | ## Tagger.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 1a0280265..dc1c083ac 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and | `vocab` | `Vocab` | The shared vocabulary. | | `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | | `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. | -| `architecture` | unicode | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. | +| `architecture` | str | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. | | **RETURNS** | `TextCategorizer` | The newly constructed object. | ### Architectures {#architectures new="2.1"} @@ -247,9 +247,9 @@ Add a new label to the pipe. > textcat.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. | ## TextCategorizer.to_disk {#to_disk tag="method"} @@ -262,10 +262,10 @@ Serialize the pipe to disk. > textcat.to_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## TextCategorizer.from_disk {#from_disk tag="method"} @@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ----------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..c71f849ad 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,15 +34,15 @@ the > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` -| Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | ----------- | ------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -55,10 +55,10 @@ Tokenize a string. > assert len(tokens) == 4 > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------- | -| `string` | unicode | The string to tokenize. | -| **RETURNS** | `Doc` | A container for linguistic annotations. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------- | +| `string` | str | The string to tokenize. 
| +| **RETURNS** | `Doc` | A container for linguistic annotations. | ## Tokenizer.pipe {#pipe tag="method"} @@ -82,20 +82,20 @@ Tokenize a stream of texts. Find internal split points of the string. -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to split. | -| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `string` | str | The string to split. | +| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. | ## Tokenizer.find_prefix {#find_prefix tag="method"} Find the length of a prefix that should be segmented from the string, or `None` if no prefix rules match. -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------ | -| `string` | unicode | The string to segment. | -| **RETURNS** | int | The length of the prefix if present, otherwise `None`. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------------ | +| `string` | str | The string to segment. | +| **RETURNS** | int | The length of the prefix if present, otherwise `None`. | ## Tokenizer.find_suffix {#find_suffix tag="method"} @@ -104,7 +104,7 @@ if no suffix rules match. | Name | Type | Description | | ----------- | ------------ | ------------------------------------------------------ | -| `string` | unicode | The string to segment. | +| `string` | str | The string to segment. | | **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. | ## Tokenizer.add_special_case {#add_special_case tag="method"} @@ -125,7 +125,7 @@ and examples. | Name | Type | Description | | ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `string` | unicode | The string to specially tokenize. | +| `string` | str | The string to specially tokenize. | | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | ## Tokenizer.explain {#explain tag="method"} @@ -142,10 +142,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens. 
> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"] > ``` -| Name | Type | Description | -| ------------| -------- | --------------------------------------------------- | -| `string` | unicode | The string to tokenize with the debugging tokenizer | -| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------------- | +| `string` | str | The string to tokenize with the debugging tokenizer | +| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | ## Tokenizer.to_disk {#to_disk tag="method"} @@ -158,10 +158,10 @@ Serialize the tokenizer to disk. > tokenizer.to_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Tokenizer.from_disk {#from_disk tag="method"} @@ -174,11 +174,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it. > tokenizer.from_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | ## Tokenizer.to_bytes {#to_bytes tag="method"} @@ -217,14 +217,14 @@ it. ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | -| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | -| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an `re.MatchObject` or `None. 
| -| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | +| Name | Type | Description | +| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | +| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | +| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | +| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an`re.MatchObject`or`None. | +| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 2360ad472..bdd094021 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -32,11 +32,11 @@ class. The data will be loaded in via > nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------------------------- | -| `name` | unicode / `Path` | Model to load, i.e. shortcut link, package name or path. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Language` | A `Language` object with the loaded model. | +| Name | Type | Description | +| ----------- | ------------ | --------------------------------------------------------------------------------- | +| `name` | str / `Path` | Model to load, i.e. shortcut link, package name or path. | +| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| **RETURNS** | `Language` | A `Language` object with the loaded model. | Essentially, `spacy.load()` is a convenience wrapper that reads the language ID and pipeline components from a model's `meta.json`, initializes the `Language` @@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of | Name | Type | Description | | ----------- | ---------- | ------------------------------------------------------------------------------------------------ | -| `name` | unicode | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | +| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | | `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. | @@ -98,10 +98,10 @@ meta data as a dictionary instead, you can use the `meta` attribute on your > spacy.info("de", markdown=True) > ``` -| Name | Type | Description | -| ---------- | ------- | ------------------------------------------------------------- | -| `model` | unicode | A model, i.e. shortcut link, package name or path (optional). | -| `markdown` | bool | Print information as Markdown. 
| +| Name | Type | Description | +| ---------- | ---- | ------------------------------------------------------------- | +| `model` | str | A model, i.e. shortcut link, package name or path (optional). | +| `markdown` | bool | Print information as Markdown. | ### spacy.explain {#spacy.explain tag="function"} @@ -122,10 +122,10 @@ list of available terms, see > # world NN noun, singular or mass > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------- | -| `term` | unicode | Term to explain. | -| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------- | +| `term` | str | Term to explain. | +| **RETURNS** | str | The explanation, or `None` if not found in the glossary. | ### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"} @@ -189,13 +189,13 @@ browser. Will run a simple web server. | Name | Type | Description | Default | | --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- | | `docs` | list, `Doc`, `Span` | Document(s) to visualize. | -| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | +| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `page` | bool | Render markup as full HTML page. | `True` | | `minify` | bool | Minify HTML markup. | `False` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `port` | int | Port to serve visualization. | `5000` | -| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` | +| `host` | str | Host to serve visualization. | `'0.0.0.0'` | ### displacy.render {#displacy.render tag="method" new="2"} @@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization. | Name | Type | Description | Default | | ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | `docs` | list, `Doc`, `Span` | Document(s) to visualize. | -| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | +| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `page` | bool | Render markup as full HTML page. | `False` | | `minify` | bool | Minify HTML markup. | `False` | | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | -| **RETURNS** | unicode | Rendered HTML markup. | +| **RETURNS** | str | Rendered HTML markup. | ### Visualizer options {#displacy_options} @@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used. 
> displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | unicode | Font name or font family for all text. | `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Type | Description | Default | +| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | +| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | +| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` | +| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` | +| `font` | str | Font name or font family for all text. | `'Arial'` | +| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | +| `arrow_stroke` | int | Width of arrow path in px. | `2` | +| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | +| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | +| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | +| `distance` | int | Distance between words in px. | `175` / `150` (compact) | #### Named Entity Visualizer options {#displacy_options-ent} @@ -263,11 +263,11 @@ If a setting is not present in the options, the default value will be used. 
> displacy.serve(doc, style="ent", options=options) > ``` -| Name | Type | Description | Default | -| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | -| `ents` | list | Entity types to highlight (`None` for all types). | `None` | -| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | -| `template` 2.2 | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | +| Name | Type | Description | Default | +| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | +| `ents` | list | Entity types to highlight (`None` for all types). | `None` | +| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | +| `template` 2.2 | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | By default, displaCy comes with colors for all [entity types supported by spaCy](/api/annotation#named-entities). If you're @@ -308,9 +308,9 @@ Set custom path to the data directory where spaCy looks for models. > # PosixPath('/custom/path') > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------- | -| `path` | unicode / `Path` | Path to new data directory. | +| Name | Type | Description | +| ------ | ------------ | --------------------------- | +| `path` | str / `Path` | Path to new data directory. | ### util.get_lang_class {#util.get_lang_class tag="function"} @@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper. | Name | Type | Description | | ----------- | ---------- | -------------------------------------- | -| `lang` | unicode | Two-letter language code, e.g. `'en'`. | +| `lang` | str | Two-letter language code, e.g. `'en'`. | | **RETURNS** | `Language` | Language class. | ### util.set_lang_class {#util.set_lang_class tag="function"} @@ -352,7 +352,7 @@ the two-letter language code. | Name | Type | Description | | ------ | ---------- | -------------------------------------- | -| `name` | unicode | Two-letter language code, e.g. `'en'`. | +| `name` | str | Two-letter language code, e.g. `'en'`. | | `cls` | `Language` | The language class, e.g. `English`. | ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} @@ -368,10 +368,10 @@ loaded lazily, to avoid expensive setup code associated with the language data. > assert util.lang_class_is_loaded("de") is False > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------- | -| `name` | unicode | Two-letter language code, e.g. `'en'`. | -| **RETURNS** | bool | Whether the class has been loaded. 
| +| Name | Type | Description | +| ----------- | ---- | -------------------------------------- | +| `name` | str | Two-letter language code, e.g. `'en'`. | +| **RETURNS** | bool | Whether the class has been loaded. | ### util.load_model {#util.load_model tag="function" new="2"} @@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk). | Name | Type | Description | | ------------- | ---------- | -------------------------------------------------------- | -| `name` | unicode | Package name, shortcut link or model path. | +| `name` | str | Package name, shortcut link or model path. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet. | Name | Type | Description | | ------------- | ---------- | ---------------------------------------------------------------------------------------------------- | -| `model_path` | unicode | Path to model data directory. | +| `model_path` | str | Path to model data directory. | | `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's | Name | Type | Description | | ------------- | ---------- | -------------------------------------------------------- | -| `init_file` | unicode | Path to model's `__init__.py`, i.e. `__file__`. | +| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -446,10 +446,10 @@ Get a model's meta.json from a directory path and validate its contents. > meta = util.get_model_meta("/path/to/model") > ``` -| Name | Type | Description | -| ----------- | ---------------- | ------------------------ | -| `path` | unicode / `Path` | Path to model directory. | -| **RETURNS** | dict | The model's meta data. | +| Name | Type | Description | +| ----------- | ------------ | ------------------------ | +| `path` | str / `Path` | Path to model directory. | +| **RETURNS** | dict | The model's meta data. | ### util.is_package {#util.is_package tag="function"} @@ -463,10 +463,10 @@ Check if string maps to a package installed via pip. Mainly used to validate > util.is_package("xyz") # False > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------- | -| `name` | unicode | Name of package. | -| **RETURNS** | `bool` | `True` if installed package, `False` if not. | +| Name | Type | Description | +| ----------- | ------ | -------------------------------------------- | +| `name` | str | Name of package. | +| **RETURNS** | `bool` | `True` if installed package, `False` if not. | ### util.get_package_path {#util.get_package_path tag="function" new="2"} @@ -480,10 +480,10 @@ Get path to an installed package. Mainly used to resolve the location of > # /usr/lib/python3.6/site-packages/en_core_web_sm > ``` -| Name | Type | Description | -| -------------- | ------- | -------------------------------- | -| `package_name` | unicode | Name of installed package. | -| **RETURNS** | `Path` | Path to model package directory. 
| +| Name | Type | Description | +| -------------- | ------ | -------------------------------- | +| `package_name` | str | Name of installed package. | +| **RETURNS** | `Path` | Path to model package directory. | ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"} diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index a4c36f8cd..939cc8655 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -35,7 +35,7 @@ you can add vectors to later. | `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `keys` | iterable | A sequence of keys aligned with the data. | | `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. | -| `name` | unicode | A name to identify the vectors table. | +| `name` | str | A name to identify the vectors table. | | **RETURNS** | `Vectors` | The newly created object. | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} @@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the | Name | Type | Description | | ----------- | ---------------------------------- | ----------------------------------------------------- | -| `key` | unicode / int | The key to add. | +| `key` | str / int | The key to add. | | `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. | | `row` | int | An optional row number of a vector to map the key to. | | **RETURNS** | int | The row the vector was added to. | @@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa. | Name | Type | Description | | ----------- | ------------------------------------- | ------------------------------------------------------------------------ | -| `key` | unicode / int | Find the row that the given key points to. Returns int, `-1` if missing. | +| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. | | `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. | | `row` | int | Find the first key that points to the row. Returns int. | | `rows` | iterable | Find the keys that point to the rows. Returns ndarray. | @@ -337,9 +337,9 @@ Save the current state to a directory. > > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Vectors.from_disk {#from_disk tag="method"} @@ -352,10 +352,10 @@ Loads state from a directory. Modifies the object in place and returns it. > vectors.from_disk("/path/to/vectors") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Vectors` | The modified `Vectors` object. 
| +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Vectors` | The modified `Vectors` object. | ## Vectors.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index e024ab54a..b851f6882 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -27,7 +27,7 @@ Create the vocabulary. | `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | | `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | -| `vectors_name` 2.2 | unicode | A name to identify the vectors table. | +| `vectors_name` 2.2 | str | A name to identify the vectors table. | | **RETURNS** | `Vocab` | The newly constructed object. | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -91,10 +91,10 @@ given string, you need to look it up in > assert oov not in nlp.vocab > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------- | -| `string` | unicode | The ID string. | -| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------- | +| `string` | str | The ID string. | +| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | ## Vocab.add_flag {#add_flag tag="method"} @@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`. | Name | Type | Description | | ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value. | +| `flag_getter` | dict | A function `f(str) -> bool`, to get the flag value. | | `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. | | **RETURNS** | int | The integer ID by which the flag value can be checked. | @@ -227,10 +227,10 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Vocab.from_disk {#from_disk tag="method" new="2"} @@ -243,11 +243,11 @@ Loads state from a directory. Modifies the object in place and returns it. 
> vocab = Vocab().from_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The modified `Vocab` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The modified `Vocab` object. | ## Vocab.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index d17e5a661..4b3c61b9d 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy).. ### Disabling the parser {#disabling} In the [default models](/models), the parser is loaded and enabled as part of -the [standard processing pipeline](/usage/processing-pipelines). If you don't need -any of the syntactic information, you should disable the parser. Disabling the -parser will make spaCy load and run much faster. If you want to load the parser, -but need to disable it for specific documents, you can also control its use on -the `nlp` object. +the [standard processing pipeline](/usage/processing-pipelines). If you don't +need any of the syntactic information, you should disable the parser. Disabling +the parser will make spaCy load and run much faster. If you want to load the +parser, but need to disable it for specific documents, you can also control its +use on the `nlp` object. ```python nlp = spacy.load("en_core_web_sm", disable=["parser"]) @@ -988,10 +988,10 @@ nlp = spacy.load("en_core_web_sm") nlp.tokenizer = my_tokenizer ``` -| Argument | Type | Description | -| ----------- | ------- | ------------------------- | -| `text` | unicode | The raw text to tokenize. | -| **RETURNS** | `Doc` | The tokenized document. | +| Argument | Type | Description | +| ----------- | ----- | ------------------------- | +| `text` | str | The raw text to tokenize. | +| **RETURNS** | `Doc` | The tokenized document. | diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 696e11106..e7aca3981 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -272,16 +272,16 @@ doc = nlp("I won't have named entities") disabled.restore() ``` -If you want to disable all pipes except for one or a few, you can use the `enable` -keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string -defining just one pipe. +If you want to disable all pipes except for one or a few, you can use the +`enable` keyword. Just like the `disable` keyword, it takes a list of pipe +names, or a string defining just one pipe. 
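For instance, `enable` also accepts a list of names. The sketch below is illustrative only; it assumes the `en_core_web_sm` model is installed and that its pipeline includes a tagger and parser:

```python
# Minimal sketch: keep several components enabled at once by passing a list.
# Assumes "en_core_web_sm" is installed; adjust the pipe names to your pipeline.
import spacy

nlp = spacy.load("en_core_web_sm")
with nlp.select_pipes(enable=["tagger", "parser"]):
    doc = nlp("Only the tagger and parser run inside this block.")
```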
+ ```python # Enable only the parser with nlp.select_pipes(enable="parser"): doc = nlp("I will only be parsed") ``` - Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method to remove pipeline components from an existing pipeline, the [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the @@ -349,12 +349,12 @@ last** in the pipeline, or define a **custom name**. If no name is set and no > nlp.add_pipe(my_component, before="parser") > ``` -| Argument | Type | Description | -| -------- | ------- | ------------------------------------------------------------------------ | -| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | -| `first` | bool | If set to `True`, component is added **first** in the pipeline. | -| `before` | unicode | String name of component to add the new component **before**. | -| `after` | unicode | String name of component to add the new component **after**. | +| Argument | Type | Description | +| -------- | ---- | ------------------------------------------------------------------------ | +| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | +| `first` | bool | If set to `True`, component is added **first** in the pipeline. | +| `before` | str | String name of component to add the new component **before**. | +| `after` | str | String name of component to add the new component **after**. | ### Example: A simple pipeline component {#custom-components-simple} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 5f47bd2e3..a84399312 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -157,19 +157,19 @@ The available token pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute | Type |  Description | -| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ | -| `ORTH` | unicode | The exact verbatim text of a token. | -| `TEXT` 2.1 | unicode | The exact verbatim text of a token. | -| `LOWER` | unicode | The lowercase form of the token text. | -|  `LENGTH` | int | The length of the token text. | -|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | -|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | -|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | -|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | -|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | -| `ENT_TYPE` | unicode | The token's entity label. | -| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | +| Attribute | Type |  Description | +| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | +| `ORTH` | str | The exact verbatim text of a token. | +| `TEXT` 2.1 | str | The exact verbatim text of a token. | +| `LOWER` | str | The lowercase form of the token text. | +|  `LENGTH` | int | The length of the token text. 
| +|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | +|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | +|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | +|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | +|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | +| `ENT_TYPE` | str | The token's entity label. | +| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | @@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included! ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} -When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**, -the EntityRuler calls the nlp object to construct a doc object. This happens in case you try -to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to -extract matches based on the pattern's POS signature. +When using a large amount of **phrase patterns** (roughly > 10000) it's useful +to understand how the `add_patterns` function of the EntityRuler works. For each +**phrase pattern**, the EntityRuler calls the nlp object to construct a doc +object. This happens in case you try to add the EntityRuler at the end of an +existing pipeline with, for example, a POS tagger and want to extract matches +based on the pattern's POS signature. -In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler. +In this case you would pass a config value of `phrase_matcher_attr="POS"` for +the EntityRuler. -Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns. +Running the full language pipeline across every pattern in a large list scales +linearly and can therefore take a long time on large amounts of phrase patterns. -As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. +As of spaCy 2.2.4 the `add_patterns` function has been refactored to use +nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with +5,000-100,000 phrase patterns respectively. -Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time. +Even with this speedup (but especially if you're using an older version) the +`add_patterns` function can still take a long time. -An easy workaround to make this function run faster is disabling the other language pipes -while adding the phrase patterns. +An easy workaround to make this function run faster is disabling the other +language pipes while adding the phrase patterns. 
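A minimal sketch of that workaround, assuming a loaded `en_core_web_sm` pipeline and using the `select_pipes` helper documented earlier in this diff (on spaCy v2.x the equivalent is `disable_pipes`); the pattern strings here are illustrative placeholders, not real data:

```python
# Sketch only: temporarily disable every other pipe so add_patterns() runs
# nothing but the tokenizer over the phrase patterns.
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp)
# Placeholder phrase patterns; in practice this would be your own large list.
patterns = [{"label": "ORG", "pattern": f"Company {i}"} for i in range(50000)]

with nlp.select_pipes(disable=nlp.pipe_names):
    ruler.add_patterns(patterns)
```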
```python entityruler = EntityRuler(nlp) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index c94c79360..c0dbfc732 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab)) If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as well, which includes the values of -[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if -they're serializable with msgpack). +[extension attributes](/usage/processing-pipelines#custom-components-attributes) +(if they're serializable with msgpack). @@ -666,10 +666,10 @@ and lets you customize how the model should be initialized and loaded. You can define the language data to be loaded and the [processing pipeline](/usage/processing-pipelines) to execute. -| Setting | Type | Description | -| ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | unicode | ID of the language class to initialize. | -| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. | +| Setting | Type | Description | +| ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | str | ID of the language class to initialize. | +| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. | The `load()` method that comes with our model package templates will take care of putting all this together and returning a `Language` object with the loaded diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index dd0b0eb50..9733e09c2 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -67,12 +67,12 @@ arcs. -| Argument | Type | Description | Default | -| --------- | ------- | ----------------------------------------------------------- | ----------- | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `"#000000"` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `"#ffffff"` | -| `font` | unicode | Font name or font family for all text. | `"Arial"` | +| Argument | Type | Description | Default | +| --------- | ---- | ----------------------------------------------------------- | ----------- | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` | +| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` | +| `font` | str | Font name or font family for all text. | `"Arial"` | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options).
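
As a closing usage sketch for the option types above (assuming the `en_core_web_sm` model is installed; the option values are only examples), the documented `str` and `bool` options can be passed to `displacy.render` like this:

```python
# Sketch: render a dependency parse with the documented visualizer options.
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
options = {"compact": True, "color": "#000000", "bg": "#ffffff", "font": "Arial"}
# render() returns the markup as a str, matching the updated RETURNS type above.
html = displacy.render(doc, style="dep", options=options)
```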