Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-05-26 11:31:41 -05:00
commit 8af3100143
40 changed files with 856 additions and 444 deletions

View File

@ -173,13 +173,13 @@ class Language(object):
flat_list.append(pipe)
self.pipeline = flat_list
def __call__(self, text, **disabled):
def __call__(self, text, disable=[]):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
text (unicode): The text to be processed.
**disabled: Elements of the pipeline that should not be run.
disable (list): Names of the pipeline components to disable.
RETURNS (Doc): A container for accessing the annotations.
EXAMPLE:
@ -190,7 +190,7 @@ class Language(object):
doc = self.make_doc(text)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
proc(doc)
return doc
@ -322,7 +322,7 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
@ -330,7 +330,7 @@ class Language(object):
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
**disabled: Pipeline components to exclude.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
EXAMPLE:
@ -342,7 +342,7 @@ class Language(object):
docs = texts
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
@ -352,12 +352,14 @@ class Language(object):
for doc in docs:
yield doc
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
def to_disk(self, path, disable=[]):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
**exclude: Named attributes to prevent from being saved.
disable (list): Names of pipeline components to disable and prevent
from being saved.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
@ -369,7 +371,7 @@ class Language(object):
raise IOError("Output path must be a directory")
props = {}
for name, value in self.__dict__.items():
if name in exclude:
if name in disable:
continue
if hasattr(value, 'to_disk'):
value.to_disk(path / name)
@ -378,13 +380,14 @@ class Language(object):
with (path / 'props.pickle').open('wb') as file_:
dill.dump(props, file_)
def from_disk(self, path, **exclude):
def from_disk(self, path, disable=[]):
"""Loads state from a directory. Modifies the object in place and
returns it.
returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
**exclude: Named attributes to prevent from being loaded.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The modified `Language` object.
EXAMPLE:
@ -393,35 +396,36 @@ class Language(object):
"""
path = util.ensure_path(path)
for name in path.iterdir():
if name not in exclude and hasattr(self, str(name)):
if name not in disable and hasattr(self, str(name)):
getattr(self, name).from_disk(path / name)
with (path / 'props.pickle').open('rb') as file_:
bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude)
self.from_bytes(bytes_data, disable)
return self
def to_bytes(self, **exclude):
def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
disable (list): Names of pipeline components to disable and prevent
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
props = dict(self.__dict__)
for key in exclude:
for key in disable:
if key in props:
props.pop(key)
return dill.dumps(props, -1)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object.
"""
props = dill.loads(bytes_data)
for key, value in props.items():
if key not in exclude:
if key not in disable:
setattr(self, key, value)
return self

View File

@ -174,6 +174,7 @@ def get_async(stream, numpy_array):
array.set(numpy_array, stream=stream)
return array
def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased --

View File

@ -37,7 +37,8 @@ mixin svg(file, name, width, height)
size - [integer] icon width and height (default: 20)
mixin icon(name, size)
+svg("icons", name, size || 20).o-icon&attributes(attributes)
- var size = size || 20
+svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes)
//- Pro/Con/Neutral icon
@ -185,3 +186,14 @@ mixin landing-header()
mixin landing-badge(url, graphic, alt, size)
+a(url)(aria-label=alt title=alt).c-landing__badge
+svg("graphics", graphic, size || 225)
//- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release.
mixin under-construction()
+infobox("🚧 Under construction")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely be mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub!

View File

@ -178,7 +178,7 @@ mixin label()
//- Tag
mixin tag()
span.u-text-tag.u-text-tag--spaced(aria-hidden="true")
span.u-text-tag.u-text-tag--spaced(aria-hidden="true")&attributes(attributes)
block
@ -192,6 +192,17 @@ mixin tag-model(...capabs)
+help(intro + ext + ".").u-color-theme
//- "New" tag to label features new in a specific version
By using a separate mixin with a version ID, it becomes easy to quickly
enable/disable tags without having to modify the markup in the docs.
version - [string or integer] version number, without "v" prefix
mixin tag-new(version)
- var version = (typeof version == 'number') ? version.toFixed(1) : version
+tag(data-tooltip="This feature is new and was introduced in spaCy v#{version}.")
| v#{version}
//- List
type - [string] "numbers", "letters", "roman" (bulleted list if none set)
start - [integer] start number

View File

@ -6,10 +6,18 @@ include _sidebar
main.o-main.o-main--sidebar.o-main--aside
article.o-content
+grid.o-no-block
+grid-col(source ? "two-thirds" : "full")
+h(1)=title
if tag
+tag=tag
if source
+grid-col("third").u-text-right
.o-inline-list
+button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)]
if ALPHA
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
| This page is part of the alpha documentation for spaCy v2.0

View File

@ -1,128 +1,128 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="736">
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
<style>
.text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
.text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
</style>
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
<text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2"/>
<path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(324 535.5)" width="37" height="18"/>
<text class="text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
<text class="svg__architecture__text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z"/>
<rect fill="#f6f6f6" transform="translate(424.5 462.5)" width="158" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
<ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
<text class="svg__architecture__text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z"/>
<rect fill="#f6f6f6" transform="translate(364.5 285.5)" width="79" height="18" />
<text class="text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
<ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
<text class="svg__architecture__text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z"/>
<rect fill="#f6f6f6" transform="translate(498.5 388.5)" width="137" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z"/>
<rect fill="#f6f6f6" transform="translate(141.5 284.5)" width="151" height="18" />
<text class="text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z"/>
<text class="text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
<text class="svg__architecture__text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z"/>
<rect fill="#f6f6f6" transform="translate(188.5 191.5)" width="115" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z"/>
<rect fill="#f6f6f6" transform="translate(512.5 191.5)" width="101" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z"/>
<rect fill="#f6f6f6" transform="translate(505.5 297.5)" width="166" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z"/>
<text class="text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z"/>
<text class="text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z"/>
<text class="text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z"/>
<text class="text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z"/>
<text class="text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z"/>
<text class="text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z"/>
<text transform="translate(221.5 77.5)" class="text-small" dy="0.85em" width="10" height="14">ja</text>
<text transform="translate(221.5 77.5)" class="svg__architecture__text-small" dy="0.85em" width="10" height="14">ja</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z"/>
<text class="text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z"/>
<rect fill="#f6f6f6" transform="translate(226.5 431.5)" width="79" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(23.5 604.5)" width="37" height="18"/>
<text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
<text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z"/>
<text class="text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
<text class="svg__architecture__text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2"/>
<path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(54.5 355.5)" width="37" height="18"/>
<text class="text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
<text class="svg__architecture__text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(62.5 632.5)" width="79" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z"/>
<text class="text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z"/>
<text class="text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(356.5 584.5)" width="101" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z"/>
<text class="text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(164.5 606.5)" width="37" height="18" />
<text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
<text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(211.5 633.5)" width="72" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z"/>
<text class="text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z"/>
<text class="text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
<text class="svg__architecture__text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z"/>
<text class="text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z"/>
<text class="text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z"/>
<text class="text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
<ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
<text class="svg__architecture__text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
</svg>

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 14 KiB

View File

@ -1,13 +1,13 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
<style>
.text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
</style>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M642.7 361.3H708l33 43.6-33 43.5H643L610 405z"/>
<text class="text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
<text class="svg__langdata__text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H621v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 240.2l4 8-4-2-4 2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 253v-20.8"/>
@ -17,7 +17,7 @@
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H504v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M504 240.2l4 8-4-2-4 2z"/>
<ellipse cx="855" cy="303" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
<text class="svg__langdata__text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100l.4 39.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389.5 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h232v22.8"/>
@ -33,50 +33,50 @@
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H46v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M46 145.8l-4-8 4 2 4-2z"/>
<ellipse cx="389" cy="50" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
<text class="svg__langdata__text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M435 193h15.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M456.8 193l-8 4 2-4-2-4z"/>
<ellipse cx="390" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
<text class="svg__langdata__text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
<path fill="none" stroke="#9673a6" stroke-width="3" stroke-miterlimit="10" d="M472 225l-1.5 133.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M470.4 364.8l-4-8 4 2 4-2z"/>
<ellipse cx="504" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
<text class="svg__langdata__text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M653 225l5.6 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M659 358.8l-4.5-8 4 2 4-2.2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M576 193h-18.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M551.2 193l8-4-2 4 2 4z"/>
<ellipse cx="621" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
<text class="svg__langdata__text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M690 193h-15.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M668.2 193l8-4-2 4 2 4z"/>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M703 225l-10.3 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M692.2 358.8l-3.4-8.3 4 2.3 4-1.7z"/>
<ellipse cx="735" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
<text class="svg__langdata__text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
</text>
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M280 238v114.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M280 358.8l-4-8 4 2 4-2z"/>
<ellipse cx="280" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
<text class="svg__langdata__text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M346 404h53.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M405.8 404l-8 4 2-4-2-4z"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M247.7 361.3H313l33 43.6-33 43.5h-65.3L215 405z"/>
<text class="text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
<text class="svg__langdata__text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M823 193h-34.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M782.2 193l8-4-2 4 2 4z"/>
<ellipse cx="855" cy="193" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="31.5" ry="31.5"/>
<text class="text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
<text class="svg__langdata__text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M408 367h124v74H408z"/>
<text class="text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
<text class="svg__langdata__text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M131 225l-21 122.2" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M109 353l-2.5-8.5 3.6 2.7 4.4-1.3z"/>
<ellipse cx="163" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
<text class="svg__langdata__text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M78 225l15.4 122" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M94.2 353l-5-7.5 4.2 1.5 3.7-2.5z"/>
<ellipse cx="46" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
<text class="svg__langdata__text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
<ellipse cx="101" cy="405" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.5" ry="49.5"/>
<text class="text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
<text class="svg__langdata__text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
</svg>

Before

Width:  |  Height:  |  Size: 8.8 KiB

After

Width:  |  Height:  |  Size: 9.0 KiB

View File

@ -1,30 +1,30 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
<style>
.text { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
.text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
</style>
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
<text class="text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
<text class="svg__pipeline__text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121.2 84.7h29.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M156.6 84.7l-8 4 2-4-2-4z"/>
<path fill="#f5f5f5" stroke="#999" stroke-width="2" d="M1 55h120v60H1z"/>
<text class="text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
<text class="svg__pipeline__text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M760 84.7h33"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M799 84.7l-8 4 2-4-2-4z"/>
<rect width="75" height="39" x="422" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="5.8" ry="5.8"/>
<text class="text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
<text class="svg__pipeline__text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M519 58h103.3L639 88l-16.8 30H519l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M622 58h103.3L742 88l-16.8 30H622l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
</svg>

Before

Width:  |  Height:  |  Size: 2.9 KiB

After

Width:  |  Height:  |  Size: 3.1 KiB

View File

@ -0,0 +1,77 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
<style>
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
</style>
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">3572</text>
<rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">508</text>
<rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">949</text>
<rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text>
<rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">&quot;coffee&quot;</text>
<rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">3572</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">&quot;I&quot;</text>
<rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">508</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">&quot;love&quot;</text>
<rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">949</text>
<rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/>
<rect fill="#E1D5E7" width="50" height="12" transform="translate(202.5 53.5)"/>
<text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="50" height="12" transform="translate(202.5 53.5)">nsubj</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M363 60h72.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M441.8 60l-8 4 2-4-2-4z"/>
<rect fill="#E1D5E7" width="43" height="12" transform="translate(375.5 54.5)"/>
<text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="43" height="12" transform="translate(375.5 54.5)">dobj</text>
<rect width="50" height="88" x="1" y="246" fill="#666" stroke="#666" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="1em" width="53" height="36" transform="rotate(-90 162 155)">String</text>
<text class="svg__vocab__text-large" dy="2em" width="53" height="36" transform="rotate(-90 162 155)">Store</text>
<rect width="50" height="88" x="1" y="135" fill="#82b366" stroke="#82b366" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="47" height="17" transform="rotate(-90 109.5 93)">Vocab</text>
<rect width="50" height="110" x="1" y="1" fill="#9673a6" stroke="#9673a6" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="31" height="17" transform="rotate(-90 44 27.5)">Doc</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 27h100v66H263z"/>
<text class="svg__vocab__text" dy="1em" width="31" height="33" transform="translate(296.5 42.5)">love</text>
<text class="svg__vocab__text-code" dy="2.8em" width="31" height="33" transform="translate(296.5 42.5)">VERB</text>
<rect width="50" height="20" x="288" y="16" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" transform="translate(294.5 19.5)">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 27h100v66H76z"/>
<text class="svg__vocab__text" dx="0.8em" dy="1em" width="29" height="33" transform="translate(110.5 42.5)">I</text>
<text class="svg__vocab__text-code" dy="2.8em" width="29" height="33" transform="translate(110.5 42.5)">PRON</text>
<rect width="50" height="20" x="105" y="17" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(111.5 20.5)">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 27h100v66H444z"/>
<text class="svg__vocab__text" dy="1em" width="45" height="33" transform="translate(470.5 42.5)">coffee</text>
<text class="svg__vocab__text-code" dx="0.6em" dy="2.8em" width="45" height="33" transform="translate(470.5 42.5)">NOUN</text>
<rect width="50" height="20" x="469" y="16" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(475.5 19.5)">Token</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 141.8v-38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 149.8l-2.7-8h5.4zM126 95.2l2.7 8h-5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 206.2l2.7 8h-5.4zM126 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 103.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 95.2l2.7 8h-5.4zM313 149.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 206.2l2.7 8h-5.4zM313 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 206.2l2.7 8h-5.4zM494 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 103.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 95.2l2.7 8h-5.4zM494 149.8l-2.7-8h5.4z"/>
</svg>

After

Width:  |  Height:  |  Size: 7.6 KiB

View File

@ -24,7 +24,8 @@
"Vocab": "vocab",
"StringStore": "stringstore",
"GoldParse": "goldparse",
"GoldCorpus": "goldcorpus"
"GoldCorpus": "goldcorpus",
"Binder": "binder"
},
"Other": {
"Annotation Specs": "annotation"
@ -47,62 +48,74 @@
"spacy": {
"title": "spaCy top-level functions",
"source": "spacy/__init__.py",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module",
"source": "spacy/displacy",
"next": "util"
},
"util": {
"title": "Utility Functions",
"source": "spacy/util.py",
"next": "cli"
},
"cli": {
"title": "Command Line Interface"
"title": "Command Line Interface",
"source": "spacy/cli"
},
"language": {
"title": "Language",
"tag": "class"
"tag": "class",
"source": "spacy/language.py"
},
"doc": {
"title": "Doc",
"tag": "class"
"tag": "class",
"source": "spacy/tokens/doc.pyx"
},
"token": {
"title": "Token",
"tag": "class"
"tag": "class",
"source": "spacy/tokens/token.pyx"
},
"span": {
"title": "Span",
"tag": "class"
"tag": "class",
"source": "spacy/tokens/span.pyx"
},
"lexeme": {
"title": "Lexeme",
"tag": "class"
"tag": "class",
"source": "spacy/lexeme.pyx"
},
"vocab": {
"title": "Vocab",
"tag": "class"
"tag": "class",
"source": "spacy/vocab.pyx"
},
"stringstore": {
"title": "StringStore",
"tag": "class"
"tag": "class",
"source": "spacy/strings.pyx"
},
"matcher": {
"title": "Matcher",
"tag": "class"
"tag": "class",
"source": "spacy/matcher.pyx"
},
"dependencyparser": {
@ -122,7 +135,8 @@
"tokenizer": {
"title": "Tokenizer",
"tag": "class"
"tag": "class",
"source": "spacy/tokenizer.pyx"
},
"tagger": {
@ -132,11 +146,18 @@
"goldparse": {
"title": "GoldParse",
"tag": "class"
"tag": "class",
"source": "spacy/gold.pyx"
},
"goldcorpus": {
"title": "GoldCorpus",
"tag": "class",
"source": "spacy/gold.pyx"
},
"binder": {
"title": "Binder",
"tag": "class"
},

View File

@ -0,0 +1,5 @@
//- 💫 DOCS > API > BINDER
include ../../_includes/_mixins
+under-construction

View File

@ -166,7 +166,7 @@ p
| #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+table(["Argument", "Type", "Description"])
+row
@ -192,18 +192,13 @@ p
+row
+cell #[code --n-iter], #[code -n]
+cell option
+cell Number of iterations (default: #[code 15]).
+cell Number of iterations (default: #[code 20]).
+row
+cell #[code --n_sents], #[code -ns]
+cell #[code --n-sents], #[code -ns]
+cell option
+cell Number of sentences (default: #[code 0]).
+row
+cell #[code --parser-L1], #[code -L]
+cell option
+cell L1 regularization penalty for parser (default: #[code 0.0]).
+row
+cell #[code --use-gpu], #[code -G]
+cell flag
@ -220,7 +215,7 @@ p
+cell Don't train parser.
+row
+cell #[code --no-ner], #[code -N]
+cell #[code --no-entities], #[code -N]
+cell flag
+cell Don't train NER.
@ -229,6 +224,106 @@ p
+cell flag
+cell Show help message and available arguments.
+h(3, "train-hyperparams") Environment variables for hyperparameters
p
| spaCy lets you set hyperparameters for training via environment variables.
| This is useful because it keeps the command simple and allows you to
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") create an alias]
| for your custom #[code train] command while still being able to easily
| tweak the hyperparameters. For example:
+code(false, "bash").
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
+under-construction
+table(["Name", "Description", "Default"])
+row
+cell #[code dropout_from]
+cell
+cell #[code 0.2]
+row
+cell #[code dropout_to]
+cell
+cell #[code 0.2]
+row
+cell #[code dropout_decay]
+cell
+cell #[code 0.0]
+row
+cell #[code batch_from]
+cell
+cell #[code 1]
+row
+cell #[code batch_to]
+cell
+cell #[code 64]
+row
+cell #[code batch_compound]
+cell
+cell #[code 1.001]
+row
+cell #[code token_vector_width]
+cell
+cell #[code 128]
+row
+cell #[code embed_size]
+cell
+cell #[code 7500]
+row
+cell #[code parser_maxout_pieces]
+cell
+cell #[code 2]
+row
+cell #[code parser_hidden_depth]
+cell
+cell #[code 1]
+row
+cell #[code hidden_width]
+cell
+cell #[code 128]
+row
+cell #[code learn_rate]
+cell
+cell #[code 0.001]
+row
+cell #[code optimizer_B1]
+cell
+cell #[code 0.9]
+row
+cell #[code optimizer_B2]
+cell
+cell #[code 0.999]
+row
+cell #[code optimizer_eps]
+cell
+cell #[code 1e-08]
+row
+cell #[code L2_penalty]
+cell
+cell #[code 1e-06]
+row
+cell #[code grad_norm_clip]
+cell
+cell #[code 1.0]
+h(2, "package") Package
p

View File

@ -10,6 +10,7 @@ p
+h(2, "serve") displacy.serve
+tag method
+tag-new(2)
p
| Serve a dependency parse tree or named entity visualization to view it
@ -71,6 +72,7 @@ p
+h(2, "render") displacy.render
+tag method
+tag-new(2)
p Render a dependency parse tree or named entity visualization.

View File

@ -255,6 +255,7 @@ p
+h(2, "to_disk") Doc.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
@ -271,12 +272,14 @@ p Save the current state to a directory.
+h(2, "from_disk") Doc.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokens import Doc
doc = Doc().from_disk('/path/to/doc')
from spacy.vocab import Vocab
doc = Doc(Vocab()).from_disk('/path/to/doc')
+table(["Name", "Type", "Description"])
+row

View File

@ -8,6 +8,7 @@ p
+h(2, "init") GoldCorpus.__init__
+tag method
+tag-new(2)
p Create a #[code GoldCorpus].

View File

@ -73,15 +73,26 @@ p
+cell The text to be processed.
+row
+cell #[code **disabled]
+cell -
+cell Elements of the pipeline that should not be run.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Doc]
+cell A container for accessing the annotations.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being run can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old doc = nlp(u"I don't want parsed", parse=False)
+h(2, "pipe") Language.pipe
+tag method
@ -112,6 +123,13 @@ p
+cell int
+cell The number of texts to buffer.
+row
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell yields
+cell #[code Doc]
@ -227,8 +245,11 @@ p
+h(2, "to_disk") Language.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
p
| Save the current state to a directory. If a model is loaded, this will
| #[strong include the model].
+aside-code("Example").
nlp.to_disk('/path/to/models')
@ -242,14 +263,21 @@ p Save the current state to a directory.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being saved.
+h(2, "from_disk") Language.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.
p
| Loads state from a directory. Modifies the object in place and returns
| it. If the saved #[code Language] object contains a model, the
| #[strong model will be loaded].
+aside-code("Example").
from spacy.language import Language
@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it.
| #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Language]
+cell The modified #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy v2.0, the #[code save_to_directory] method has been
| renamed to #[code to_disk], to improve consistency across classes.
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_disk('/path/to/model', disable=['tagger', 'ner'])
+code-old nlp = spacy.load('en', tagger=False, entity=False)
+h(2, "to_bytes") Language.to_bytes
+tag method
@ -283,9 +324,12 @@ p Serialize the current state to a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being serialized.
+footrow
+cell returns
@ -310,15 +354,26 @@ p Load state from a binary string.
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Language]
+cell The #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
+code-old nlp = English().from_bytes(bytes, tagger=False, entity=False)
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
@ -327,6 +382,11 @@ p Load state from a binary string.
+cell #[code Vocab]
+cell A container for the lexical types.
+row
+cell #[code tokenizer]
+cell #[code Tokenizer]
+cell The tokenizer.
+row
+cell #[code make_doc]
+cell #[code lambda text: Doc]

View File

@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation.
+row
+cell #[code is_alpha]
+cell bool
+cell Equivalent to #[code word.orth_.isalpha()].
+cell
| Does the lexeme consist of alphabetic characters? Equivalent to
| #[code lexeme.text.isalpha()].
+row
+cell #[code is_ascii]
+cell bool
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
+cell
| Does the lexeme consist of ASCII characters? Equivalent to
| #[code not any(ord(c) >= 128 for c in lexeme.text)].
+row
+cell #[code is_digit]
+cell bool
+cell Equivalent to #[code word.orth_.isdigit()].
+cell
| Does the lexeme consist of digits? Equivalent to
| #[code lexeme.text.isdigit()].
+row
+cell #[code is_lower]
+cell bool
+cell Equivalent to #[code word.orth_.islower()].
+cell
| Is the lexeme in lowercase? Equivalent to
| #[code lexeme.text.islower()].
+row
+cell #[code is_title]
+cell bool
+cell Equivalent to #[code word.orth_.istitle()].
+cell
| Is the lexeme in titlecase? Equivalent to
| #[code lexeme.text.istitle()].
+row
+cell #[code is_punct]
+cell bool
+cell Equivalent to #[code word.orth_.ispunct()].
+cell Is the lexeme punctuation?
+row
+cell #[code is_space]
+cell bool
+cell Equivalent to #[code word.orth_.isspace()].
+cell
| Does the lexeme consist of whitespace characters? Equivalent to
| #[code lexeme.text.isspace()].
+row
+cell #[code like_url]
+cell bool
+cell Does the word resemble a URL?
+cell Does the lexeme resemble a URL?
+row
+cell #[code like_num]
+cell bool
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
+cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
+row
+cell #[code like_email]
+cell bool
+cell Does the word resemble an email address?
+cell Does the lexeme resemble an email address?
+row
+cell #[code is_oov]
+cell bool
+cell Is the word out-of-vocabulary?
+cell Is the lexeme out-of-vocabulary?
+row
+cell #[code is_stop]
+cell bool
+cell Is the word part of a "stop list"?
+cell Is the lexeme part of a "stop list"?
+row
+cell #[code lang]

View File

@ -5,6 +5,7 @@ include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
doc = nlp(u'hello world!')
matches = matcher(doc)
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
| matches. A match tuple describes a span #[code doc[start:end]].
| The #[code match_id] is the ID of the added match pattern.
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+h(2, "pipe") Matcher.pipe
+tag method
@ -118,6 +119,7 @@ p Match a stream of documents, yielding them in turn.
+h(2, "len") Matcher.__len__
+tag method
+tag-new(2)
p
| Get the number of rules added to the matcher. Note that this only returns
@ -138,6 +140,7 @@ p
+h(2, "contains") Matcher.__contains__
+tag method
+tag-new(2)
p Check whether the matcher contains rules for a match ID.
@ -159,6 +162,7 @@ p Check whether the matcher contains rules for a match ID.
+h(2, "add") Matcher.add
+tag method
+tag-new(2)
p
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
@ -198,8 +202,23 @@ p
| Match pattern. A pattern consists of a list of dicts, where each
| dict describes a token.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID.
+code-new.
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+code-old.
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+h(2, "remove") Matcher.remove
+tag method
+tag-new(2)
p
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
@ -219,6 +238,7 @@ p
+h(2, "get") Matcher.get
+tag method
+tag-new(2)
p
| Retrieve the pattern stored for a key. Returns the rule as an

View File

@ -20,12 +20,7 @@ p
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
nlp = spacy.load('en', disable=['parser', 'tagger'])
+table(["Name", "Type", "Description"])
+row
@ -34,15 +29,28 @@ p
+cell Model to load, i.e. shortcut link, package name or path.
+row
+cell #[code **overrides]
+cell -
+cell Override or disable components.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Language]
+cell A #[code Language] object with the loaded model.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+code-new nlp = spacy.load('/model')
+code-old nlp = spacy.load('en', path='/model')
+h(2, "info") spacy.info
+tag function
@ -98,3 +106,37 @@ p
+cell returns
+cell unicode
+cell The explanation, or #[code None] if not found in the glossary.
+h(2, "set_factory") spacy.set_factory
+tag function
+tag-new(2)
p
| Set a factory that returns a custom
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+aside-code("Example").
def my_factory(vocab):
def my_component(doc):
return doc
return my_component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory'])
+table(["Name", "Type", "Description"])
+row
+cell #[code factory_id]
+cell unicode
+cell
| Unique name of factory. If added to a new pipeline, spaCy will
| look up the factory for this ID and use it to create the
| component.
+row
+cell #[code factory]
+cell callable
+cell
| Callable that takes a #[code Vocab] object and returns a pipeline
| component.

View File

@ -104,6 +104,7 @@ p
+h(2, "to_disk") StringStore.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
@ -118,8 +119,9 @@ p Save the current state to a directory.
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+h(2, "from_disk") StringStore.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.
@ -137,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+cell #[code StringStore]
+cell The modified #[code StringStore] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+h(2, "to_bytes") StringStore.to_bytes
+tag method
p Serialize the current state to a binary string.
@ -157,9 +159,9 @@ p Serialize the current state to a binary string.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+cell The serialized form of the #[code StringStore] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+h(2, "from_bytes") StringStore.from_bytes
+tag method
p Load state from a binary string.

View File

@ -370,116 +370,131 @@ p The L2 norm of the token's vector representation.
+cell #[code lemma]
+cell int
+cell
| Base form of the word, with no inflectional suffixes.
| Base form of the token, with no inflectional suffixes.
+row
+cell #[code lemma_]
+cell unicode
+cell Base form of the word, with no inflectional suffixes.
+cell Base form of the token, with no inflectional suffixes.
+row
+cell #[code lower]
+cell int
+cell Lower-case form of the word.
+cell Lower-case form of the token.
+row
+cell #[code lower_]
+cell unicode
+cell Lower-case form of the word.
+cell Lower-case form of the token.
+row
+cell #[code shape]
+cell int
+cell Transform of the word's string, to show orthographic features.
+cell
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
+row
+cell #[code shape_]
+cell unicode
+cell A transform of the word's string, to show orthographic features.
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
+row
+cell #[code prefix]
+cell int
+cell Integer ID of a length-N substring from the start of the
| word. Defaults to #[code N=1].
| token. Defaults to #[code N=1].
+row
+cell #[code prefix_]
+cell unicode
+cell
| A length-N substring from the start of the word. Defaults to
| A length-N substring from the start of the token. Defaults to
| #[code N=1].
+row
+cell #[code suffix]
+cell int
+cell
| Length-N substring from the end of the word. Defaults to #[code N=3].
| Length-N substring from the end of the token. Defaults to #[code N=3].
+row
+cell #[code suffix_]
+cell unicode
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
+row
+cell #[code is_alpha]
+cell bool
+cell Equivalent to #[code word.orth_.isalpha()].
+cell
| Does the token consist of alphabetic characters? Equivalent to
| #[code token.text.isalpha()].
+row
+cell #[code is_ascii]
+cell bool
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
+cell
| Does the token consist of ASCII characters? Equivalent to
| #[code all(ord(c) < 128 for c in token.text)].
+row
+cell #[code is_digit]
+cell bool
+cell Equivalent to #[code word.orth_.isdigit()].
+cell
| Does the token consist of digits? Equivalent to
| #[code token.text.isdigit()].
+row
+cell #[code is_lower]
+cell bool
+cell Equivalent to #[code word.orth_.islower()].
+cell
| Is the token in lowercase? Equivalent to
| #[code token.text.islower()].
+row
+cell #[code is_title]
+cell bool
+cell Equivalent to #[code word.orth_.istitle()].
+cell
| Is the token in titlecase? Equivalent to
| #[code token.text.istitle()].
+row
+cell #[code is_punct]
+cell bool
+cell Equivalent to #[code word.orth_.ispunct()].
+cell Is the token punctuation?
+row
+cell #[code is_space]
+cell bool
+cell Equivalent to #[code word.orth_.isspace()].
+cell
| Does the token consist of whitespace characters? Equivalent to
| #[code token.text.isspace()].
+row
+cell #[code like_url]
+cell bool
+cell Does the word resemble a URL?
+cell Does the token resemble a URL?
+row
+cell #[code like_num]
+cell bool
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
+cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
+row
+cell #[code like_email]
+cell bool
+cell Does the word resemble an email address?
+cell Does the token resemble an email address?
+row
+cell #[code is_oov]
+cell bool
+cell Is the word out-of-vocabulary?
+cell Is the token out-of-vocabulary?
+row
+cell #[code is_stop]
+cell bool
+cell Is the word part of a "stop list"?
+cell Is the token part of a "stop list"?
+row
+cell #[code pos]

View File

@ -198,91 +198,6 @@ p
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer_bytes = tokenizer.to_bytes()
new_tokenizer = Tokenizer(nlp.vocab)
new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])

View File

@ -76,6 +76,7 @@ p
+h(2, "resolve_model_path") util.resolve_model_path
+tag function
+tag-new(2)
p Resolve a model name or string to a model path.
@ -169,6 +170,7 @@ p
+h(2, "is_in_jupyter") util.is_in_jupyter
+tag function
+tag-new(2)
p
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
@ -221,6 +223,7 @@ p
+h(2, "prints") util.prints
+tag function
+tag-new(2)
p
| Print a formatted, text-wrapped message with optional title. If a text

View File

@ -159,6 +159,7 @@ p
+h(2, "to_disk") Vocab.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
@ -175,6 +176,7 @@ p Save the current state to a directory.
+h(2, "from_disk") Vocab.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.

View File

@ -80,7 +80,7 @@
},
"customizing-tokenizer": {
"title": "Customizing the tokenizer",
"title": "Customising the tokenizer",
"next": "rule-based-matching"
},

View File

@ -48,3 +48,13 @@ p
+cell ner
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
p
| The processing pipeline always #[strong depends on the statistical model]
| and its capabilities. For example, a pipeline can only include an entity
| recognizer component if the model includes data to make predictions of
| entity labels. This is why each model will specify the pipeline to use
| in its meta data, as a simple list containing the component names:
+code(false, "json").
"pipeline": ["vectorizer", "tagger", "parser", "ner"]

View File

@ -22,10 +22,10 @@ p
| untrusted sources.
p
| All container classes and pipeline components, i.e.
for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"]
| #[+api(cls.toLowerCase()) #[code=cls]],
| have the following methods available:
| All container classes, i.e. #[+api("language") #[code Language]],
| #[+api("doc") #[code Doc]], #[+api("vocab") #[code Vocab]] and
| #[+api("stringstore") #[code StringStore]] have the following methods
| available:
+table(["Method", "Returns", "Example"])
- style = [1, 0, 1]
@ -34,7 +34,35 @@ p
+annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
+annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
p
| For example, if you've processed a very large document, you can use
| #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your
| local machine. This will save the document and its tokens, as well as
| the vocabulary associated with the #[code Doc].
+aside("Why saving the vocab?")
| Saving the vocabulary with the #[code Doc] is important, because the
| #[code Vocab] holds the context-independent information about the words,
| tags and labels, and their #[strong integer IDs]. If the #[code Vocab]
| wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
| those IDs – for example, the word text or the dependency labels. You
| might be saving #[code 446] for "whale", but in a different vocabulary,
| this ID could map to "VERB". Similarly, if your document was processed by
| a German model, its vocab will include the specific
| #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
+code.
moby_dick = open('moby_dick.txt', 'r').read() # open and read a large document
doc = nlp(moby_dick) # process it
doc.to_disk('/moby_dick.bin') # save the processed Doc
p
| If you need it again later, you can load it back into an empty #[code Doc]
| with an empty #[code Vocab] by calling
| #[+api("doc#from_disk") #[code from_disk()]]:
+code.
from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab
doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc

View File

@ -1,3 +1,3 @@
//- 💫 DOCS > USAGE > SPACY 101 > TRAINING
p
+under-construction

View File

@ -0,0 +1,92 @@
//- 💫 DOCS > USAGE > SPACY 101 > VOCAB & STRINGSTORE
p
| Whenever possible, spaCy tries to store data in a vocabulary, the
| #[+api("vocab") #[code Vocab]], that will be
| #[strong shared by multiple documents]. To save memory, spaCy also
| encodes all strings to #[strong integer IDs] – in this case for example,
| "coffee" has the ID #[code 3572]. Entity labels like "ORG" and
| part-of-speech tags like "VERB" are also encoded. Internally, spaCy
| only "speaks" in integer IDs.
+aside
| #[strong Token]: A word, punctuation mark etc. #[em in context], including
| its attributes, tags and dependencies.#[br]
| #[strong Lexeme]: A "word type" with no context. Includes the word shape
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
| #[strong Doc]: A processed container of tokens in context.#[br]
| #[strong Vocab]: The collection of lexemes.#[br]
| #[strong StringStore]: The dictionary mapping integer IDs to strings, for
| example #[code 3572] &rarr; "coffee".
+image
include ../../../assets/img/docs/vocab_stringstore.svg
.u-text-right
+button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic
p
| If you process lots of documents containing the word "coffee" in all
| kinds of different contexts, storing the exact string "coffee" every time
| would take up way too much space. So instead, spaCy assigns it an ID
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can
| think of the #[code StringStore] as a
| #[strong lookup table that works in both directions] – you can look up a
| string to get its ID, or an ID to get its string:
+code.
doc = nlp(u'I love coffee')
assert doc.vocab.strings[u'coffee'] == 3572
assert doc.vocab.strings[3572] == u'coffee'
p
| Now that all strings are encoded, the entries in the vocabulary
| #[strong don&apos;t need to include the word text] themselves. Instead,
| they can look it up in the #[code StringStore] via its integer ID. Each
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
| contains the #[strong context-independent] information about a word.
| For example, no matter if "love" is used as a verb or a noun in some
| context, its spelling and whether it consists of alphabetic characters
| won't ever change.
+code.
for word in doc:
lexeme = doc.vocab[word.text]
print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
+aside
| #[strong Text]: The original text of the lexeme.#[br]
| #[strong Orth]: The integer ID of the lexeme.#[br]
| #[strong Shape]: The abstract word shape of the lexeme.#[br]
| #[strong Prefix]: By default, the first letter of the word string.#[br]
| #[strong Suffix]: By default, the last three letters of the word string.#[br]
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
| #[strong is digit]: Does the lexeme consist of digits?#[br]
| #[strong is title]: Is the lexeme in titlecase?#[br]
| #[strong Lang]: The language of the parent vocabulary.
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"])
- var style = [0, 1, 1, 0, 0, 1, 1, 1, 0]
+annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style)
+annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style)
+annotation-row(["coffee", 3572, "xxxx", "c", "fee", true, false, false, "en"], style)
p
| The specific entries in the vocabulary and their IDs don't really matter
| #[strong as long as they match]. That's why you always need to make sure
| all objects you create have access to the same vocabulary. If they don't,
| the IDs won't match and spaCy will either produce very confusing results,
| or fail altogether.
+code.
from spacy.tokens import Doc
from spacy.vocab import Vocab
doc = nlp(u'I like coffee') # original Doc
new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab
assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc
assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
p
| Even though both #[code Doc] objects contain the same words, the internal
| integer IDs are very different.

View File

@ -107,7 +107,6 @@ p
.u-text-right
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+table(["File name", "Variables", "Description"])
+row
+cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
@ -439,7 +438,7 @@ p
+h(3, "morph-rules") Morph rules
//- TODO: write morph rules section
+under-construction
+h(2, "testing") Testing the new language tokenizer
@ -631,7 +630,7 @@ p
| trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
| The #[code vectors.bin] file should consist of one word and vector per line.
+aside-code("your_data_directory", "yaml").
//-+aside-code("your_data_directory", "yaml").
├── vocab/
| ├── lexemes.bin
| ├── strings.json
@ -662,4 +661,4 @@ p
| model using spaCy's #[+api("cli#train") #[code train]] command:
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]

View File

@ -17,6 +17,8 @@ p
| #[+a("http://deeplearning.net/software/theano/") Theano] is also
| supported.
+under-construction
+code("Runtime usage").
def count_entity_sentiment(nlp, texts):
'''Compute the net document sentiment for each entity in the texts.'''
@ -153,7 +155,9 @@ p
| adding another LSTM layer, using attention mechanism, using character
| features, etc.
+h(2, "attribute-hooks") Attribute hooks (experimental)
+h(2, "attribute-hooks") Attribute hooks
+under-construction
p
| Earlier, we saw how to store data in the new generic #[code user_data]

View File

@ -322,8 +322,9 @@ p
| If you don't need a particular component of the pipeline – for
| example, the tagger or the parser, you can disable loading it. This can
| sometimes make a big difference and improve loading speed. Disabled
| component names can be provided to #[code spacy.load], #[code from_disk]
| or the #[code nlp] object itself as a list:
| component names can be provided to #[+api("spacy#load") #[code spacy.load]],
| #[+api("language#from_disk") #[code Language.from_disk]] or the
| #[code nlp] object itself as a list:
+code.
nlp = spacy.load('en', disable=['parser', 'tagger'])

View File

@ -35,7 +35,7 @@ p
assert doc[0].text == u'Peach'
assert doc[1].text == u'emoji'
assert doc[-1].text == u'🍑'
assert doc[17:19] == u'outranking eggplant'
assert doc[17:19].text == u'outranking eggplant'
assert list(doc.noun_chunks)[0].text == u'Peach emoji'
sentences = list(doc.sents)

View File

@ -2,16 +2,18 @@
include ../../_includes/_mixins
+under-construction
+h(2, "multithreading") Multi-threading with #[code .pipe()]
p
| If you have a sequence of documents to process, you should use the
| #[+api("language#pipe") #[code .pipe()]] method. The method takes an
| iterator of texts, and accumulates an internal buffer,
| #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
| an iterator of texts, and accumulates an internal buffer,
| which it works on in parallel. It then yields the documents in order,
| one-by-one. After a long and bitter struggle, the global interpreter
| lock was freed around spaCy's main parsing loop in v0.100.3. This means
| that the #[code .pipe()] method will be significantly faster in most
| that #[code .pipe()] will be significantly faster in most
| practical situations, because it allows shared memory parallelism.
+code.
@ -20,23 +22,27 @@ p
p
| To make full use of the #[code .pipe()] function, you might want to
| brush up on Python generators. Here are a few quick hints:
| brush up on #[strong Python generators]. Here are a few quick hints:
+list
+item
| Generator comprehensions can be written
| (#[code item for item in sequence])
| Generator comprehensions can be written as
| #[code (item for item in sequence)].
+item
| The #[code itertools] built-in library and the #[code cytoolz]
| package provide a lot of handy generator tools
| The
| #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
| and the
| #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
| provide a lot of handy #[strong generator tools].
+item
| Often you'll have an input stream that pairs text with some
| important metadata, e.g. a JSON document. To pair up the metadata
| with the processed #[code Doc] object, you should use the tee
| function to split the generator in two, and then #[code izip] the
| extra stream to the document stream.
| important meta data, e.g. a JSON document. To
| #[strong pair up the meta data] with the processed #[code Doc]
| object, you should use the #[code itertools.tee] function to split
| the generator in two, and then #[code izip] the extra stream to the
| document stream.
+h(2, "own-annotations") Bringing your own annotations

View File

@ -4,6 +4,8 @@ include ../../_includes/_mixins
+h(2, "features") Features
+under-construction
+aside
| If one of spaCy's functionalities #[strong needs a model], it means that
| you need to have one of the available
@ -91,17 +93,35 @@ p
include _spacy-101/_tokenization
+infobox
| To learn more about how spaCy's tokenizer and its rules work in detail,
| how to #[strong customise] it and how to #[strong add your own tokenizer]
| to a processing pipeline, see the usage guide on
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
+tag-model("dependency parse")
include _spacy-101/_pos-deps
+infobox
| To learn more about #[strong part-of-speech tagging] and rule-based
| morphology, and how to #[strong navigate and use the parse tree]
| effectively, see the usage guides on
| #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
| #[+a("/docs/usage/dependency-parse") using the dependency parse].
+h(3, "annotations-ner") Named Entities
+tag-model("named entities")
include _spacy-101/_named-entities
+infobox
| To learn more about entity recognition in spaCy, how to
| #[strong add your own entities] to a document and how to train and update
| the entity predictions of a model, see the usage guide on
| #[+a("/docs/usage/entity-recognition") named entity recognition].
+h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors")
@ -109,20 +129,43 @@ include _spacy-101/_similarity
include _spacy-101/_word-vectors
+infobox
| To learn more about word vectors, how to #[strong customise them] and
| how to load #[strong your own vectors] into spaCy, see the usage
| guide on
| #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
+h(2, "pipelines") Pipelines
include _spacy-101/_pipelines
+infobox
| To learn more about #[strong how processing pipelines work] in detail,
| how to enable and disable their components, and how to
| #[strong create your own], see the usage guide on
| #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
+h(2, "vocab-stringstore") Vocab, lexemes and the string store
include _spacy-101/_vocab-stringstore
+h(2, "serialization") Serialization
include _spacy-101/_serialization
+infobox
| To learn more about #[strong serialization] and how to
| #[strong save and load your own models], see the usage guide on
| #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
+h(2, "training") Training
include _spacy-101/_training
+h(2, "architecture") Architecture
+under-construction
+image
include ../../assets/img/docs/architecture.svg
.u-text-right

View File

@ -64,44 +64,10 @@ p
| predicts the new category with minimal difference from the previous
| output.
+h(2, "saving-loading") Saving and loading
p
| After training our model, you'll usually want to save its state, and load
| it back later. You can do this with the #[code Language.save_to_directory()]
| method:
+code.
nlp.save_to_directory('/home/me/data/en_technology')
p
| To make the model more convenient to deploy, we recommend wrapping it as
| a Python package, so that you can install it via pip and load it as a
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
| CLI command to create all required files and directories.
+code(false, "bash").
python -m spacy package /home/me/data/en_technology /home/me/my_models
p
| To build the package and create a #[code .tar.gz] archive, run
| #[code python setup.py sdist] from within its directory.
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading") saving and loading models].
p
| After you've generated and installed the package, you'll be able to
| load the model as follows:
+code.
import en_technology
nlp = en_technology.load()
+h(2, "example") Example: Adding and training an #[code ANIMAL] entity
+under-construction
p
| This script shows how to add a new entity type to an existing pre-trained
| NER model. To keep the example short and simple, only four sentences are
@ -170,5 +136,33 @@ p
p
| After training your model, you can
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
| models as Python packages, for ease of deployment.
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
| wrapping models as Python packages, for ease of deployment.
+h(2, "saving-loading") Saving and loading
p
| After training your model, you'll usually want to save its state, and load
| it back later. You can do this with the
| #[+api("language#to_disk") #[code Language.to_disk()]] method:
+code.
nlp.to_disk('/home/me/data/en_technology')
p
| To make the model more convenient to deploy, we recommend wrapping it as
| a Python package, so that you can install it via pip and load it as a
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
| CLI command to create all required files and directories.
+code(false, "bash").
python -m spacy package /home/me/data/en_technology /home/me/my_models
p
| To build the package and create a #[code .tar.gz] archive, run
| #[code python setup.py sdist] from within its directory.
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading#models") saving and loading models].

View File

@ -81,59 +81,3 @@ p.o-inline-list
p
+button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
+h(2, "feature-templates") Customizing the feature extraction
p
| spaCy currently uses linear models for the tagger, parser and entity
| recognizer, with weights learned using the
| #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].
+aside("Linear Model Feature Scheme")
| For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme].
p
| Because it's a linear model, it's important for accuracy to build
| conjunction features out of the atomic predictors. Let's say you have
| two atomic predictors asking, "What is the part-of-speech of the
| previous token?", and "What is the part-of-speech of the token before
| the previous token?". These predictors will introduce a number of features,
| e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
| template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
p
| The feature extraction proceeds in two passes. In the first pass, we
| fill an array with the values of all of the atomic predictors. In the
| second pass, we iterate over the feature templates, and fill a small
| temporary array with the predictors that will be combined into a
| conjunction feature. Finally, we hash this array into a 64-bit integer,
| using the MurmurHash algorithm. You can see this at work in the
| #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module.
p
| It's very easy to change the feature templates, to create novel
| combinations of the existing atomic predictors. There's currently no API
| available to add new atomic predictors, though. You'll have to create a
| subclass of the model, and write your own #[code set_featuresC] method.
p
| The feature templates are passed in using the #[code features] keyword
| argument to the constructors of the #[+api("tagger") #[code Tagger]],
| #[+api("dependencyparser") #[code DependencyParser]] and
| #[+api("entityrecognizer") #[code EntityRecognizer]]:
+code.
from spacy.vocab import Vocab
from spacy.pipeline import Tagger
from spacy.tagger import P2_orth, P1_orth
from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth
vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster),
(P2_orth,), (P1_orth,), (W_orth,),
(N1_orth,), (N2_orth,)])
p
| Custom feature templates can be passed to the #[code DependencyParser]
| and #[code EntityRecognizer] as well, also using the #[code features]
| keyword argument of the constructor.

View File

@ -50,9 +50,10 @@ p
p
| spaCy's serialization API has been made consistent across classes and
| objects. All container classes and pipeline components now have a
| #[code to_bytes()], #[code from_bytes()], #[code to_disk()] and
| #[code from_disk()] method that supports the Pickle protocol.
| objects. All container classes, i.e. #[code Language], #[code Doc],
| #[code Vocab] and #[code StringStore], now have #[code to_bytes()],
| #[code from_bytes()], #[code to_disk()] and #[code from_disk()] methods
| that support the Pickle protocol.
p
| The improved #[code spacy.load] makes loading models easier and more

View File

@ -334,7 +334,7 @@ p
| token #[code &lt;script src="malicious-code.js"&gt;&lt;/script&gt;].
| Instead of relying on the server to render and sanitize HTML, you
| can do this on the client in JavaScript. displaCy.js creates
| the SVG markup as DOM nodes and will never insert raw HTML.
| the markup as DOM nodes and will never insert raw HTML.
p
| The #[code parse_deps] function takes a #[code Doc] object and returns

View File

@ -23,41 +23,20 @@ p
include _spacy-101/_similarity
include _spacy-101/_word-vectors
+h(2, "custom") Customising word vectors
+under-construction
p
| By default, #[+api("token#vector") #[code Token.vector]] returns the
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while
| #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] return an average of the
| vectors of their tokens.
p
| You can customize these
| vectors of their tokens. You can customize these
| behaviours by modifying the #[code doc.user_hooks],
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
| dictionaries.
+code("Example").
# TODO
p
| You can load new word vectors from a file-like buffer using the
| #[code vocab.load_vectors()] method. The file should be a
| whitespace-delimited text file, where the word is in the first column,
| and subsequent columns provide the vector data. For faster loading, you
| can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a
| path to a binary file written by #[code vocab.dump_vectors()].
+code("Example").
# TODO
p
| You can also load vectors from memory by writing to the
| #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors
| you are writing are of different dimensionality
| from the ones currently loaded, you should first call
| #[code vocab.resize_vectors(new_size)].
+h(2, "similarity") Similarity
+under-construction