spaCy/website/docs/api/util.jade

287 lines
8.3 KiB
Plaintext

//- 💫 DOCS > API > UTIL
include ../../_includes/_mixins
p
| spaCy comes with a small collection of utility functions located in
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
| Because utility functions are mostly intended for
| #[strong internal use within spaCy], their behaviour may change with
| future releases. The functions documented on this page should be safe
| to use and we'll try to ensure backwards compatibility. However, we
| recommend having additional tests in place if your application depends on
| any of spaCy's utilities.
+h(2, "get_data_path") util.get_data_path
+tag function
p
| Get path to the data directory where spaCy looks for models. Defaults to
| #[code spacy/data].
+table(["Name", "Type", "Description"])
+row
+cell #[code require_exists]
+cell bool
+cell Only return path if it exists, otherwise return #[code None].
+footrow
+cell returns
+cell #[code Path] / #[code None]
+cell Data path or #[code None].
+h(2, "set_data_path") util.set_data_path
+tag function
p
| Set custom path to the data directory where spaCy looks for models.
+aside-code("Example").
util.set_data_path('/custom/path')
util.get_data_path()
# PosixPath('/custom/path')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell Path to new data directory.
+h(2, "get_lang_class") util.get_lang_class
+tag function
p
| Import and load a #[code Language] class. Allows lazy-loading
| #[+a("/docs/usage/adding-languages") language data] and importing
| languages using the two-letter language code.
+aside-code("Example").
for lang_id in ['en', 'de']:
lang_class = util.get_lang_class(lang_id)
lang = lang_class()
tokenizer = lang.Defaults.create_tokenizer()
+table(["Name", "Type", "Description"])
+row
+cell #[code lang]
+cell unicode
+cell Two-letter language code, e.g. #[code 'en'].
+footrow
+cell returns
+cell #[code Language]
+cell Language class.
+h(2, "load_model") util.load_model
+tag function
+tag-new(2)
p
| Load a model from a shortcut link, package or data path. If called with a
| shortcut link or package name, spaCy will assume the model is a Python
| package and import and call its #[code load()] method. If called with a
| path, spaCy will assume it's a data directory, read the language and
| pipeline settings from the meta.json and initialise a #[code Language]
| class. The model data will then be loaded in via
| #[+api("language#from_disk") #[code Language.from_disk()]].
+aside-code("Example").
nlp = util.load_model('en')
nlp = util.load_model('en_core_web_sm', disable=['ner'])
nlp = util.load_model('/path/to/data')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Package name, shortcut link or model path.
+row
+cell #[code **overrides]
+cell -
+cell Specific overrides, like pipeline components to disable.
+footrow
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "load_model_from_init_py") util.load_model_from_init_py
+tag function
+tag-new(2)
p
| A helper function to use in the #[code load()] method of a model package's
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
+aside-code("Example").
from spacy.util import load_model_from_init_py
def load(**overrides):
return load_model_from_init_py(__file__, **overrides)
+table(["Name", "Type", "Description"])
+row
+cell #[code init_file]
+cell unicode
+cell Path to model's __init__.py, i.e. #[code __file__].
+row
+cell #[code **overrides]
+cell -
+cell Specific overrides, like pipeline components to disable.
+footrow
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "get_model_meta") util.get_model_meta
+tag function
+tag-new(2)
p
| Get a model's meta.json from a directory path and validate its contents.
+aside-code("Example").
meta = util.get_model_meta('/path/to/model')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell Path to model directory.
+footrow
+cell returns
+cell dict
+cell The model's meta data.
+h(2, "is_package") util.is_package
+tag function
p
| Check if string maps to a package installed via pip. Mainly used to
| validate #[+a("/docs/usage/models") model packages].
+aside-code("Example").
util.is_package('en_core_web_sm') # True
util.is_package('xyz') # False
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of package.
+footrow
+cell returns
+cell #[code bool]
+cell #[code True] if installed package, #[code False] if not.
+h(2, "get_package_path") util.get_package_path
+tag function
+tag-new(2)
p
| Get path to an installed package. Mainly used to resolve the location of
| #[+a("/docs/usage/models") model packages]. Currently imports the package
| to find its path.
+aside-code("Example").
util.get_package_path('en_core_web_sm')
# /usr/lib/python3.6/site-packages/en_core_web_sm
+table(["Name", "Type", "Description"])
+row
+cell #[code package_name]
+cell unicode
+cell Name of installed package.
+footrow
+cell returns
+cell #[code Path]
+cell Path to model package directory.
+h(2, "is_in_jupyter") util.is_in_jupyter
+tag function
+tag-new(2)
p
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
| notebook by detecting the IPython kernel. Mainly used for the
| #[+api("displacy") #[code displacy]] visualizer.
+aside-code("Example").
html = '<h1>Hello world!</h1>'
if util.is_in_jupyter():
from IPython.core.display import display, HTML
display(HTML(html))
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell #[code True] if in Jupyter, #[code False] if not.
+h(2, "update_exc") util.update_exc
+tag function
p
| Update, validate and overwrite
| #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions].
| Used to combine global exceptions with custom, language-specific
| exceptions. Will raise an error if key doesn't match #[code ORTH] values.
+aside-code("Example").
BASE = {"a.": [{ORTH: "a."}], ":)": [{ORTH: ":)"}]}
NEW = {"a.": [{ORTH: "a.", LEMMA: "all"}]}
exceptions = util.update_exc(BASE, NEW)
# {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]}
+table(["Name", "Type", "Description"])
+row
+cell #[code base_exceptions]
+cell dict
+cell Base tokenizer exceptions.
+row
+cell #[code *addition_dicts]
+cell dicts
+cell Exception dictionaries to add to the base exceptions, in order.
+footrow
+cell returns
+cell dict
+cell Combined tokenizer exceptions.
+h(2, "prints") util.prints
+tag function
+tag-new(2)
p
| Print a formatted, text-wrapped message with optional title. If a text
| argument is a #[code Path], it's converted to a string. Should only
| be used for interactive components like the #[+api("cli") cli].
+aside-code("Example").
data_path = Path('/some/path')
if not data_path.exists():
util.prints("Can't find the path.", data_path,
title="Error", exits=1)
+table(["Name", "Type", "Description"])
+row
+cell #[code *texts]
+cell unicode
+cell Texts to print. Each argument is rendered as a paragraph.
+row
+cell #[code **kwargs]
+cell -
+cell
| #[code title] is rendered as coloured headline. #[code exits]
| performs system exit after printing, using the value of the
| argument as the exit code, e.g. #[code exits=1].