Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-05-25 17:16:10 -05:00
commit faff1c23fb
52 changed files with 2409 additions and 1700 deletions

View File

@ -83,6 +83,7 @@ cpdef enum attr_id_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB

View File

@ -85,6 +85,7 @@ IDS = {
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,

View File

@ -82,6 +82,7 @@ cpdef enum symbol_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB

View File

@ -84,6 +84,7 @@ IDS = {
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,

View File

@ -155,3 +155,15 @@ def test_doc_token_api_head_setter(en_tokenizer):
assert doc[3].left_edge.i == 0
assert doc[4].left_edge.i == 0
assert doc[2].left_edge.i == 0

def test_sent_start(en_tokenizer):
    doc = en_tokenizer(u'This is a sentence. This is another.')
    assert not doc[0].sent_start
    assert not doc[5].sent_start
    doc[5].sent_start = True
    assert doc[5].sent_start
    assert not doc[0].sent_start
    doc.is_parsed = True
    assert len(list(doc.sents)) == 2

View File

@ -24,6 +24,7 @@ from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..attrs cimport SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..syntax.iterators import CHUNKERS
from ..util import normalize_slice
@ -52,6 +53,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
return token.dep
elif feat_name == HEAD:
return token.head
elif feat_name == SENT_START:
return token.sent_start
elif feat_name == SPACY:
return token.spacy
elif feat_name == ENT_IOB:
@ -559,14 +562,16 @@ cdef class Doc:
for i in range(self.length):
self.c[i] = parsed[i]
def from_array(self, attrs, int[:, :] array):
"""Load attributes from a numpy array. Write to a `Doc` object, from an
`(M, N)` array of attributes.
attrs (ints): A list of attribute ID ints.
array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
RETURNS (Doc): Itself.
"""
def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs:
raise ValueError(
"Conflicting attributes specified in doc.from_array():\n"
"(HEAD, SENT_START)\n"
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
"based on the tree structure. This means the HEAD attribute would "
"potentially override the sentence boundaries set by SENT_START.\n"
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
"workarounds, and to propose solutions.")
cdef int i, col
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
@ -593,6 +598,24 @@ cdef class Doc:
self.is_tagged = bool(TAG in attrs or POS in attrs)
return self
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
raise NotImplementedError()
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (Doc): The modified `Doc` object.
"""
raise NotImplementedError()
def to_bytes(self):
"""Serialize, i.e. export the document contents to a binary string.

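A minimal sketch of the round trip this guard protects (illustration only, not part of the diff; it assumes an existing nlp pipeline object and uses nlp.make_doc() to get an unparsed Doc):

from spacy.attrs import TAG, SENT_START

doc = nlp(u'This is a sentence. This is another.')
array = doc.to_array([TAG, SENT_START])                # export selected attributes
new_doc = nlp.make_doc(u'This is a sentence. This is another.')
new_doc.from_array([TAG, SENT_START], array)           # fine: HEAD is not included
# passing both HEAD and SENT_START would raise the ValueError shown above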
View File

@ -279,6 +279,18 @@ cdef class Token:
def __get__(self):
return self.c.r_kids
property sent_start:
def __get__(self):
return self.c.sent_start
def __set__(self, bint value):
if self.doc.is_parsed:
raise ValueError(
'Refusing to write to token.sent_start if its document is parsed, '
'because this may cause inconsistent state. '
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
self.c.sent_start = value
property lefts:
def __get__(self):
"""

View File

@ -42,10 +42,11 @@ mixin icon(name, size)
//- Pro/Con/Neutral icon
icon - [string] "pro", "con" or "neutral" (default: "neutral")
size - [integer] icon size (optional)
mixin procon(icon)
mixin procon(icon, size)
- colors = { pro: "green", con: "red", neutral: "yellow" }
+icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
+icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
//- Headlines Helper Mixin

View File

@ -103,9 +103,11 @@ mixin button(url, trusted, ...style)
label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS
icon - [string] icon to display next to code block, mostly used for old/new
height - [integer] optional height to clip code block to
mixin code(label, language, icon)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes)
mixin code(label, language, icon, height)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? "height: #{height}px" : "")&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
@ -350,7 +352,22 @@ mixin pos-row(tag, pos, morph, desc)
| #[code=m]
+cell.u-text-small=desc
mixin dep-row(label, desc)
+row
+cell #[code=label]
+cell=desc
//- Table rows for linguistic annotations
annots [array] - array of cell content
style [array] - array of 1 (display as code) or 0 (display as text)
mixin annotation-row(annots, style)
+row
for cell, i in annots
if style && style[i]
- cell = (typeof(cell) != 'boolean') ? cell : cell ? 'True' : 'False'
+cell #[code=cell]
else
+cell=cell

View File

@ -3,126 +3,126 @@
.text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
.text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
.text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
</style>
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8" pointer-events="none"/>
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2" pointer-events="none"/>
<path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z" pointer-events="none"/>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2"/>
<path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(324 535.5)" width="37" height="18"/>
<text class="text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z"/>
<rect fill="#f6f6f6" transform="translate(424.5 462.5)" width="158" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
<ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8" pointer-events="none"/>
<ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z"/>
<rect fill="#f6f6f6" transform="translate(364.5 285.5)" width="79" height="18" />
<text class="text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
<ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8" pointer-events="none"/>
<ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z"/>
<rect fill="#f6f6f6" transform="translate(498.5 388.5)" width="137" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z"/>
<rect fill="#f6f6f6" transform="translate(141.5 284.5)" width="151" height="18" />
<text class="text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z" pointer-events="none"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z"/>
<text class="text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z"/>
<rect fill="#f6f6f6" transform="translate(188.5 191.5)" width="115" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z"/>
<rect fill="#f6f6f6" transform="translate(512.5 191.5)" width="101" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z"/>
<rect fill="#f6f6f6" transform="translate(505.5 297.5)" width="166" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z"/>
<text class="text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z"/>
<text class="text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z"/>
<text class="text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z"/>
<text class="text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z"/>
<text class="text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z"/>
<text class="text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z"/>
<text transform="translate(221.5 77.5)" class="text-small" dy="0.85em" width="10" height="14">ja</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z" pointer-events="none"/>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z"/>
<text class="text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z"/>
<rect fill="#f6f6f6" transform="translate(226.5 431.5)" width="79" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2" pointer-events="none"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z" pointer-events="none"/>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(23.5 604.5)" width="37" height="18"/>
<text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z" pointer-events="none"/>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z"/>
<text class="text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2" pointer-events="none"/>
<path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z" pointer-events="none"/>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2"/>
<path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(54.5 355.5)" width="37" height="18"/>
<text class="text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(62.5 632.5)" width="79" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z" pointer-events="none"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z"/>
<text class="text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z" pointer-events="none"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z"/>
<text class="text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(356.5 584.5)" width="101" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z" pointer-events="none"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z"/>
<text class="text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2" pointer-events="none"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z" pointer-events="none"/>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(164.5 606.5)" width="37" height="18" />
<text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(211.5 633.5)" width="72" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z" pointer-events="none"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z"/>
<text class="text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z" pointer-events="none"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z"/>
<text class="text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z" pointer-events="none"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z"/>
<text class="text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z" pointer-events="none"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z"/>
<text class="text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z" pointer-events="none"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z"/>
<text class="text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
<ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8" pointer-events="none"/>
<ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
</svg>


View File

@ -0,0 +1,30 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
<style>
.text { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
.text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
</style>
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
<text class="text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121.2 84.7h29.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M156.6 84.7l-8 4 2-4-2-4z"/>
<path fill="#f5f5f5" stroke="#999" stroke-width="2" d="M1 55h120v60H1z"/>
<text class="text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M760 84.7h33"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M799 84.7l-8 4 2-4-2-4z"/>
<rect width="75" height="39" x="422" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="5.8" ry="5.8"/>
<text class="text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M519 58h103.3L639 88l-16.8 30H519l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M622 58h103.3L742 88l-16.8 30H622l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
</svg>


View File

@ -1,10 +1,5 @@
//- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS
+infobox("Tip")
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
| description for the string representation of a label. For example,
| #[code spacy.explain("prt")] will return "particle".
+h(3, "dependency-parsing-english") English dependency labels
p

View File

@ -1,10 +1,5 @@
//- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES
+infobox("Tip")
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
| description for the string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
+table([ "Type", "Description" ])
+row
+cell #[code PERSON]

View File

@ -1,10 +1,5 @@
//- 💫 DOCS > API > ANNOTATION > POS TAGS
+infobox("Tip")
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
| description for the string representation of a tag. For example,
| #[code spacy.explain("RB")] will return "adverb".
+h(3, "pos-tagging-english") English part-of-speech tag scheme
p

View File

@ -27,8 +27,7 @@
"GoldCorpus": "goldcorpus"
},
"Other": {
"Annotation Specs": "annotation",
"Feature Scheme": "features"
"Annotation Specs": "annotation"
}
},
@ -143,9 +142,5 @@
"annotation": {
"title": "Annotation Specifications"
},
"features": {
"title": "Linear Model Feature Scheme"
}
}

View File

@ -14,11 +14,12 @@ p
| (#[code ' ']) is included as a token.
+aside-code("Example").
from spacy.en import English
nlp = English(parser=False)
from spacy.lang.en import English
nlp = English()
tokens = nlp('Some\nspaces and\ttab characters')
print([t.orth_ for t in tokens])
# ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters']
tokens_text = [t.text for t in tokens]
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
'\t', 'tab', 'characters']
p
| The whitespace tokens are useful for much the same reason punctuation is
@ -38,6 +39,11 @@ p
+h(2, "pos-tagging") Part-of-speech Tagging
+aside("Tip: Understanding tags")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a tag. For example,
| #[code spacy.explain("RB")] will return "adverb".
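p
| As a quick illustration (a minimal sketch, assuming spaCy v1.8.3+ is
| installed), the tip above can be tried directly in a Python session:
+code.
import spacy
print(spacy.explain('RB')) # 'adverb'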
include _annotation/_pos-tags
+h(2, "lemmatization") Lemmatization
@ -50,25 +56,35 @@ p A "lemma" is the uninflected form of a word. In English, this means:
+item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
+item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"
+aside("About spaCy's custom pronoun lemma")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for
| all personal pronouns.
p
| The lemmatization data is taken from
| #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
| special case for pronouns: all pronouns are lemmatized to the special
| token #[code -PRON-].
+infobox("About spaCy's custom pronoun lemma")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns.
+h(2, "dependency-parsing") Syntactic Dependency Parsing
+aside("Tip: Understanding labels")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a label. For example,
| #[code spacy.explain("prt")] will return "particle".
include _annotation/_dep-labels
+h(2, "named-entities") Named Entity Recognition
+aside("Tip: Understanding entity types")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
include _annotation/_named-entities
+h(3, "biluo") BILUO Scheme

View File

@ -253,6 +253,44 @@ p
+cell #[code Doc]
+cell Itself.
+h(2, "to_disk") Doc.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
doc.to_disk('/path/to/doc')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Doc.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokens import Doc
doc = Doc().from_disk('/path/to/doc')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Doc]
+cell The modified #[code Doc] object.
+h(2, "to_bytes") Doc.to_bytes
+tag method

View File

@ -1,138 +0,0 @@
//- 💫 DOCS > API > LINEAR MODEL FEATURES
include ../../_includes/_mixins
p
| There are two popular strategies for putting together machine learning
| models for NLP: sparse linear models, and neural networks. To solve NLP
| problems with linear models, feature templates need to be assembled that
| combine multiple atomic predictors. This page documents the atomic
| predictors used in the spaCy 1.0 #[+api("parser") #[code Parser]],
| #[+api("tagger") #[code Tagger]] and
| #[+api("entityrecognizer") #[code EntityRecognizer]].
p
| To understand the scheme, recall that spaCy's #[code Parser] and
| #[code EntityRecognizer] are implemented as push-down automata. They
| maintain a "stack" that holds the current entity, and a "buffer"
| consisting of the words to be processed.
p
| Each state consists of the words on the stack (if any), which constitute
| the current entity being constructed. We also have the current word, and
| the two subsequent words. Finally, we also have the entities previously
| built.
p
| This gives us a number of tokens to ask questions about, to make the
| features. About each of these tokens, we can ask about a number of
| different properties. Each feature identifier asks about a specific
| property of a specific token of the context.
+h(2, "tokens") Context tokens
+table([ "ID", "Description" ])
+row
+cell #[code S0]
+cell
| The first word on the stack, i.e. the token most recently added
| to the current entity.
+row
+cell #[code S1]
+cell The second word on the stack, i.e. the second most recently added.
+row
+cell #[code S2]
+cell The third word on the stack, i.e. the third most recently added.
+row
+cell #[code N0]
+cell The first word of the buffer, i.e. the current word being tagged.
+row
+cell #[code N1]
+cell The second word of the buffer.
+row
+cell #[code N2]
+cell The third word of the buffer.
+row
+cell #[code P1]
+cell The word immediately before #[code N0].
+row
+cell #[code P2]
+cell The second word before #[code N0].
+row
+cell #[code E0]
+cell The first word of the previously constructed entity.
+row
+cell #[code E1]
+cell The first word of the second previously constructed entity.
p About each of these tokens, we can ask:
+table([ "ID", "Attribute", "Description" ])
+row
+cell #[code N0w]
+cell #[code token.orth]
+cell The word form.
+row
+cell #[code N0W]
+cell #[code token.lemma]
+cell The word's lemma.
+row
+cell #[code N0p]
+cell #[code token.tag]
+cell The word's (full) POS tag.
+row
+cell #[code N0c]
+cell #[code token.cluster]
+cell The word's (full) Brown cluster.
+row
+cell #[code N0c4]
+cell -
+cell First four digit prefix of the word's Brown cluster.
+row
+cell #[code N0c6]
+cell -
+cell First six digit prefix of the word's Brown cluster.
+row
+cell #[code N0L]
+cell -
+cell The word's dependency label. Not used as a feature in the NER.
+row
+cell #[code N0_prefix]
+cell #[code token.prefix]
+cell The first three characters of the word.
+row
+cell #[code N0_suffix]
+cell #[code token.suffix]
+cell The last three characters of the word.
+row
+cell #[code N0_shape]
+cell #[code token.shape]
+cell The word's shape, i.e. is it alphabetic, numeric, etc.
+row
+cell #[code N0_ne_iob]
+cell #[code token.ent_iob]
+cell The Inside/Outside/Begin code of the word's NER tag.
+row
+cell #[code N0_ne_type]
+cell #[code token.ent_type]
+cell The word's NER type.

View File

@ -1,14 +0,0 @@
//- 💫 DOCS > API > PHILOSOPHY
include ../../_includes/_mixins
p Every product needs to know why it exists. Here's what we're trying to do with spaCy and why it's different from other NLP libraries.
+h(2) 1. No job too big.
p Most programs get cheaper to run over time, but NLP programs often get more expensive. The data often grows faster than the hardware improves. For web-scale tasks, Moore's law can't save us — so if we want to read the web, we have to sweat performance.
+h(2) 2. Take a stand.
p Most NLP toolkits position themselves as platforms, rather than libraries. They offer a pluggable architecture, and leave it to the user to arrange the components they offer into a useful system. This is fine for researchers, but for production users, this does too little. Components go out of date quickly, and configuring a good system takes very detailed knowledge. Compatibility problems can be extremely subtle. spaCy is therefore extremely opinionated. The API does not expose any algorithmic details. You're free to configure another pipeline, but the core library eliminates redundancy, and only offers one choice of each component.
+h(2) 3. Stay current.
p There's often significant improvement in NLP models year-on-year. This has been especially true recently, given the success of deep learning models. With spaCy, you should be able to build things you couldn't build yesterday. To deliver on that promise, we need to be giving you the latest stuff.

View File

@ -33,6 +33,11 @@ p
+cell unicode or #[code Path]
+cell Model to load, i.e. shortcut link, package name or path.
+row
+cell #[code **overrides]
+cell -
+cell Override or disable components.
+footrow
+cell returns
+cell #[code Language]

View File

@ -338,8 +338,10 @@ p The L2 norm of the token's vector representation.
+cell #[code ent_iob]
+cell int
+cell
| IOB code of named entity tag.
| #[code 1="I", 2="O", 3="B"]. #[code 0] means no tag is assigned.
| IOB code of named entity tag. #[code "B"]
| means the token begins an entity, #[code "I"] means it is inside
| an entity, #[code "O"] means it is outside an entity, and
| #[code ""] means no entity tag is set.
+row
+cell #[code ent_iob_]

View File

@ -225,7 +225,7 @@ p
p
| Print a formatted, text-wrapped message with optional title. If a text
| argument is a #[code Path], it's converted to a string. Should only
| be used for interactive components like the #[+a("/docs/usage/cli") CLI].
| be used for interactive components like the #[+api("cli") cli].
+aside-code("Example").
data_path = Path('/some/path')

View File

@ -3,28 +3,25 @@
"Get started": {
"Installation": "./",
"Models": "models",
"spaCy 101": "spacy-101",
"Lightning tour": "lightning-tour",
"Visualizers": "visualizers",
"Troubleshooting": "troubleshooting",
"What's new in v2.0": "v2"
},
"Workflows": {
"spaCy 101": "spacy-101",
"Loading the pipeline": "language-processing-pipeline",
"Processing text": "processing-text",
"spaCy's data model": "data-model",
"Guides": {
"POS tagging": "pos-tagging",
"Using the parse": "dependency-parse",
"Entity recognition": "entity-recognition",
"Custom pipelines": "customizing-pipeline",
"Rule-based matching": "rule-based-matching",
"Word vectors": "word-vectors-similarities",
"Deep learning": "deep-learning",
"Custom tokenization": "customizing-tokenizer",
"Rule-based matching": "rule-based-matching",
"Adding languages": "adding-languages",
"Processing pipelines": "language-processing-pipeline",
"Deep learning": "deep-learning",
"Production use": "production-use",
"Training": "training",
"Training NER": "training-ner",
"Saving & loading": "saving-loading"
"Saving & loading": "saving-loading",
"Visualizers": "visualizers"
},
"Examples": {
"Tutorials": "tutorials",
@ -38,55 +35,33 @@
"quickstart": true
},
"v2": {
"title": "What's new in v2.0"
},
"models": {
"title": "Models",
"next": "lightning-tour",
"next": "spacy-101",
"quickstart": true
},
"spacy-101": {
"title": "spaCy 101",
"next": "lightning-tour"
},
"lightning-tour": {
"title": "Lightning tour",
"next": "spacy-101"
"next": "v2"
},
"visualizers": {
"title": "Visualizers"
},
"troubleshooting": {
"title": "Troubleshooting",
"next": "resources"
"v2": {
"title": "What's new in v2.0"
},
"resources": {
"title": "Resources"
},
"spacy-101": {
"title": "spaCy 101"
},
"language-processing-pipeline": {
"title": "Loading a language processing pipeline",
"next": "processing-text"
},
"customizing-pipeline": {
"title": "Customizing the pipeline",
"next": "customizing-tokenizer"
},
"processing-text": {
"title": "Processing text",
"next": "data-model"
},
"data-model": {
"title": "Understanding spaCy's data model"
"pos-tagging": {
"title": "Part-of-speech tagging",
"next": "dependency-parse"
},
"dependency-parse": {
@ -95,27 +70,45 @@
},
"entity-recognition": {
"title": "Entity recognition",
"next": "rule-based-matching"
},
"rule-based-matching": {
"title": "Rule-based matching"
"title": "Named Entity Recognition",
"next": "training-ner"
},
"word-vectors-similarities": {
"title": "Using word vectors and semantic similarities"
},
"deep-learning": {
"title": "Hooking a deep learning model into spaCy"
"title": "Using word vectors and semantic similarities",
"next": "customizing-tokenizer"
},
"customizing-tokenizer": {
"title": "Customizing the tokenizer",
"next": "rule-based-matching"
},
"rule-based-matching": {
"title": "Rule-based matching",
"next": "adding-languages"
},
"adding-languages": {
"title": "Adding languages",
"next": "training"
},
"language-processing-pipeline": {
"title": "Language processing pipelines",
"next": "deep-learning"
},
"deep-learning": {
"title": "Hooking a deep learning model into spaCy",
"next": "production use"
},
"production-use": {
"title": "Production use",
"next": "training"
},
"training": {
"title": "Training spaCy's statistical models",
"next": "saving-loading"
@ -127,17 +120,7 @@
},
"saving-loading": {
"title": "Saving and loading models"
},
"pos-tagging": {
"title": "Part-of-speech tagging",
"next": "dependency-parse"
},
"adding-languages": {
"title": "Adding languages",
"next": "training"
"title": "Saving, loading and data serialization"
},
"showcase": {

View File

@ -0,0 +1,38 @@
//- 💫 DOCS > USAGE > SPACY 101 > NAMED ENTITIES
p
| A named entity is a "real-world object" that's assigned a name, for
| example a person, a country, a product or a book title. spaCy can
| #[strong recognise] #[+a("/docs/api/annotation#named-entities") various types]
| of named entities in a document, by asking the model for a
| #[strong prediction]. Because models are statistical and strongly depend
| on the examples they were trained on, this doesn't always work
| #[em perfectly] and might need some tuning later, depending on your use
| case.
p
| Named entities are available as the #[code ents] property of a #[code Doc]:
+code.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
+aside
| #[strong Text]: The original entity text.#[br]
| #[strong Start]: Index of start of entity in the #[code Doc].#[br]
| #[strong End]: Index of end of entity in the #[code Doc].#[br]
| #[strong Label]: Entity label, i.e. type.
+table(["Text", "Start", "End", "Label", "Description"])
- var style = [0, 1, 1, 1, 0]
+annotation-row(["Apple", 0, 5, "ORG", "Companies, agencies, institutions."], style)
+annotation-row(["U.K.", 27, 31, "GPE", "Geopolitical entity, i.e. countries, cities, states."], style)
+annotation-row(["$1 billion", 44, 54, "MONEY", "Monetary values, including unit."], style)
p
| Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
| here's what our example sentence and its named entities look like:
+codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160)

View File

@ -0,0 +1,50 @@
//- 💫 DOCS > USAGE > SPACY 101 > PIPELINES
p
| When you call #[code nlp] on a text, spaCy first tokenizes the text to
| produce a #[code Doc] object. The #[code Doc] is then processed in several
| different steps; this is also referred to as the
| #[strong processing pipeline]. The pipeline used by the
| #[+a("/docs/usage/models") default models] consists of a
| vectorizer, a tagger, a parser and an entity recognizer. Each pipeline
| component returns the processed #[code Doc], which is then passed on to
| the next component.
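p
| As a minimal sketch (assuming a default English model is installed and
| linked under the #[code 'en'] shortcut), running the whole pipeline is
| simply a matter of calling #[code nlp] on your text:
+code.
import spacy
nlp = spacy.load('en') # load the model, which bundles the pipeline
doc = nlp(u'This is a sentence.') # tokenize, then run each component in order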
+image
include ../../../assets/img/docs/pipeline.svg
.u-text-right
+button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
+aside
| #[strong Name:] ID of the pipeline component.#[br]
| #[strong Component:] spaCy's implementation of the component.#[br]
| #[strong Creates:] Objects, attributes and properties modified and set by
| the component.
+table(["Name", "Component", "Creates"])
+row
+cell tokenizer
+cell #[+api("tokenizer") #[code Tokenizer]]
+cell #[code Doc]
+row("divider")
+cell vectorizer
+cell #[code Vectorizer]
+cell #[code Doc.tensor]
+row
+cell tagger
+cell #[+api("tagger") #[code Tagger]]
+cell #[code Doc[i].tag]
+row
+cell parser
+cell #[+api("dependencyparser") #[code DependencyParser]]
+cell
| #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents],
| #[code Doc.noun_chunks]
+row
+cell ner
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]

View File

@ -0,0 +1,62 @@
//- 💫 DOCS > USAGE > SPACY 101 > POS TAGGING AND DEPENDENCY PARSING
p
| After tokenization, spaCy can also #[strong parse] and #[strong tag] a
| given #[code Doc]. This is where the statistical model comes in, which
| enables spaCy to #[strong make a prediction] of which tag or label most
| likely applies in this context. A model consists of binary data and is
| produced by showing a system enough examples for it to make predictions
| that generalise across the language; for example, a word following "the"
| in English is most likely a noun.
p
| Linguistic annotations are available as
| #[+api("token#attributes") #[code Token] attributes]. Like many NLP
| libraries, spaCy #[strong encodes all strings to integers] to reduce
| memory usage and improve efficiency. So to get the readable string
| representation of an attribute, we need to add an underscore #[code _]
| to its name:
+code.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
+aside
| #[strong Text:] The original word text.#[br]
| #[strong Lemma:] The base form of the word.#[br]
| #[strong POS:] The simple part-of-speech tag.#[br]
| #[strong Tag:] The detailed part-of-speech tag.#[br]
| #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br]
| #[strong Shape:] The word shape, i.e. capitalisation, punctuation, digits.#[br]
| #[strong is alpha:] Is the token an alpha character?#[br]
| #[strong is stop:] Is the token part of a stop list, i.e. the most common
| words of the language?#[br]
+table(["Text", "Lemma", "POS", "Tag", "Dep", "Shape", "alpha", "stop"])
- var style = [0, 0, 1, 1, 1, 1, 1, 1]
+annotation-row(["Apple", "apple", "PROPN", "NNP", "nsubj", "Xxxxx", true, false], style)
+annotation-row(["is", "be", "VERB", "VBZ", "aux", "xx", true, true], style)
+annotation-row(["looking", "look", "VERB", "VBG", "ROOT", "xxxx", true, false], style)
+annotation-row(["at", "at", "ADP", "IN", "prep", "xx", true, true], style)
+annotation-row(["buying", "buy", "VERB", "VBG", "pcomp", "xxxx", true, false], style)
+annotation-row(["U.K.", "u.k.", "PROPN", "NNP", "compound", "X.X.", false, false], style)
+annotation-row(["startup", "startup", "NOUN", "NN", "dobj", "xxxx", true, false], style)
+annotation-row(["for", "for", "ADP", "IN", "prep", "xxx", true, true], style)
+annotation-row(["$", "$", "SYM", "$", "quantmod", "$", false, false], style)
+annotation-row(["1", "1", "NUM", "CD", "compound", "d", false, false], style)
+annotation-row(["billion", "billion", "NUM", "CD", "pobj", "xxxx", true, false], style)
+aside("Tip: Understanding tags and labels")
| Most of the tags and labels look pretty abstract, and they vary between
| languages. #[code spacy.explain()] will show you a short description.
| For example, #[code spacy.explain("VBZ")] returns "verb, 3rd person
| singular present".
p
| Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
| here's what our example sentence and its dependencies look like:
+codepen("030d1e4dfa6256cad8fdd59e6aefecbe", 460)

View File

@ -0,0 +1,40 @@
//- 💫 DOCS > USAGE > SPACY 101 > SERIALIZATION
p
| If you've been modifying the pipeline, vocabulary, vectors and entities,
| or made updates to the model, you'll eventually want to
| #[strong save your progress], for example everything that's in your #[code nlp]
| object. This means you'll have to translate its contents and structure
| into a format that can be saved, like a file or a byte string. This
| process is called serialization. spaCy comes with
| #[strong built-in serialization methods] and supports the
| #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol].
+aside("What's pickle?")
| Pickle is Python's built-in object persistence system. It lets you
| transfer arbitrary Python objects between processes. This is usually used
| to load an object to and from disk, but it's also used for distributed
| computing, e.g. with
| #[+a("https://spark.apache.org/docs/0.9.0/python-programming-guide.html") PySpark]
| or #[+a("http://dask.pydata.org/en/latest/") Dask]. When you unpickle an
| object, you're agreeing to execute whatever code it contains. It's like
| calling #[code eval()] on a string, so don't unpickle objects from
| untrusted sources.
p
| All container classes and pipeline components, i.e.
for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"]
| #[+api(cls.toLowerCase()) #[code=cls]],
| have the following methods available:
+table(["Method", "Returns", "Example"])
- style = [1, 0, 1]
+annotation-row(["to_bytes", "bytes", "nlp.to_bytes()"], style)
+annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style)
+annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
+annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
+code.
moby_dick = open('moby_dick.txt', 'r').read() # read in a large document
doc = nlp(moby_dick) # process it
doc.to_disk('/moby_dick.bin') # save the processed Doc
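p
| A minimal sketch of the byte-string round trip from the table above (the
| #[code Doc(nlp.vocab)] construction mirrors the #[code from_disk] example
| in the API docs and is an assumption here):
+code.
from spacy.tokens import Doc
doc_bytes = doc.to_bytes() # serialize the processed Doc to a byte string
new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) # restore it into a fresh Doc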

View File

@ -0,0 +1,44 @@
//- 💫 DOCS > USAGE > SPACY 101 > SIMILARITY
p
| spaCy is able to compare two objects, and make a prediction of
| #[strong how similar they are]. Predicting similarity is useful for
| building recommendation systems or flagging duplicates. For example, you
| can suggest a user content that's similar to what they're currently
| looking at, or label a support ticket as a duplicate, if it's very
| similar to an already existing one.
p
| Each #[code Doc], #[code Span] and #[code Token] comes with a
| #[+api("token#similarity") #[code .similarity()]] method that lets you
| compare it with another object, and determine the similarity. Of course
| similarity is always subjective; whether "dog" and "cat" are similar
| really depends on how you're looking at it. spaCy's similarity model
| usually assumes a pretty general-purpose definition of similarity.
+code.
tokens = nlp(u'dog cat banana')
for token1 in tokens:
    for token2 in tokens:
        print(token1.similarity(token2))
+aside
| #[strong #[+procon("neutral", 16)] similarity:] identical#[br]
| #[strong #[+procon("pro", 16)] similarity:] similar (higher is more similar) #[br]
| #[strong #[+procon("con", 16)] similarity:] dissimilar (lower is less similar)
+table(["", "dog", "cat", "banana"])
each cells, label in {"dog": [1.00, 0.80, 0.24], "cat": [0.80, 1.00, 0.28], "banana": [0.24, 0.28, 1.00]}
+row
+cell.u-text-label.u-color-theme=label
for cell in cells
+cell #[code=cell.toFixed(2)]
| #[+procon(cell < 0.5 ? "con" : cell != 1 ? "pro" : "neutral")]
p
| In this case, the model's predictions are pretty on point. A dog is very
| similar to a cat, whereas a banana is not very similar to either of them.
| Identical tokens are obviously 100% similar to each other (just not always
| exactly #[code 1.0], because of vector math and floating point
| imprecisions).

View File

@ -0,0 +1,18 @@
//- 💫 DOCS > USAGE > SPACY 101 > TOKENIZATION
p
| During processing, spaCy first #[strong tokenizes] the text, i.e.
| segments it into words, punctuation and so on. This is done by applying
| rules specific to each language. For example, punctuation at the end of a
| sentence should be split off, whereas "U.K." should remain one token.
| Each #[code Doc] consists of individual tokens, and we can simply iterate
| over them:
+code.
for token in doc:
    print(token.text)
+table([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).u-text-center
+row
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
+cell=cell

View File

@ -0,0 +1,3 @@
//- 💫 DOCS > USAGE > SPACY 101 > TRAINING
p

View File

@ -0,0 +1,152 @@
//- 💫 DOCS > USAGE > SPACY 101 > WORD VECTORS
p
| Similarity is determined by comparing #[strong word vectors] or "word
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
| #[+a("/docs/usage/models") default models] come with
| #[strong 300-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
3.28450017e-02, -4.19569999e-01, 7.20689967e-02,
-3.74760002e-01, 5.74599989e-02, -1.24009997e-02,
5.29489994e-01, -5.23800015e-01, -1.97710007e-01,
-3.41470003e-01, 5.33169985e-01, -2.53309999e-02,
1.73800007e-01, 1.67720005e-01, 8.39839995e-01,
5.51070012e-02, 1.05470002e-01, 3.78719985e-01,
2.42750004e-01, 1.47449998e-02, 5.59509993e-01,
1.25210002e-01, -6.75960004e-01, 3.58420014e-01,
-4.00279984e-02, 9.59490016e-02, -5.06900012e-01,
-8.53179991e-02, 1.79800004e-01, 3.38669986e-01,
1.32300004e-01, 3.10209990e-01, 2.18779996e-01,
1.68530002e-01, 1.98740005e-01, -5.73849976e-01,
-1.06490001e-01, 2.66689986e-01, 1.28380001e-01,
-1.28030002e-01, -1.32839993e-01, 1.26570001e-01,
8.67229998e-01, 9.67210010e-02, 4.83060002e-01,
2.12709993e-01, -5.49900010e-02, -8.24249983e-02,
2.24079996e-01, 2.39749998e-01, -6.22599982e-02,
6.21940017e-01, -5.98999977e-01, 4.32009995e-01,
2.81430006e-01, 3.38420011e-02, -4.88150001e-01,
-2.13589996e-01, 2.74010003e-01, 2.40950003e-01,
4.59500015e-01, -1.86049998e-01, -1.04970002e+00,
-9.73049998e-02, -1.89080000e-01, -7.09290028e-01,
4.01950002e-01, -1.87680006e-01, 5.16870022e-01,
1.25200003e-01, 8.41499984e-01, 1.20970003e-01,
8.82389992e-02, -2.91959997e-02, 1.21510006e-03,
5.68250008e-02, -2.74210006e-01, 2.55640000e-01,
6.97930008e-02, -2.22580001e-01, -3.60060006e-01,
-2.24020004e-01, -5.36990017e-02, 1.20220006e+00,
5.45350015e-01, -5.79980016e-01, 1.09049998e-01,
4.21669990e-01, 2.06619993e-01, 1.29360005e-01,
-4.14570011e-02, -6.67770028e-01, 4.04670000e-01,
-1.52179999e-02, -2.76400000e-01, -1.56110004e-01,
-7.91980028e-02, 4.00369987e-02, -1.29439995e-01,
-2.40900001e-04, -2.67850012e-01, -3.81150007e-01,
-9.72450018e-01, 3.17259997e-01, -4.39509988e-01,
4.19340014e-01, 1.83530003e-01, -1.52600005e-01,
-1.08080000e-01, -1.03579998e+00, 7.62170032e-02,
1.65189996e-01, 2.65259994e-04, 1.66160002e-01,
-1.52810007e-01, 1.81229994e-01, 7.02740014e-01,
5.79559989e-03, 5.16639985e-02, -5.97449988e-02,
-2.75510013e-01, -3.90489995e-01, 6.11319989e-02,
5.54300010e-01, -8.79969969e-02, -4.16810006e-01,
3.28260005e-01, -5.25489986e-01, -4.42880005e-01,
8.21829960e-03, 2.44859993e-01, -2.29819998e-01,
-3.49810004e-01, 2.68940002e-01, 3.91660005e-01,
-4.19039994e-01, 1.61909997e-01, -2.62630010e+00,
6.41340017e-01, 3.97430003e-01, -1.28680006e-01,
-3.19460005e-01, -2.56330013e-01, -1.22199997e-01,
3.22750002e-01, -7.99330026e-02, -1.53479993e-01,
3.15050006e-01, 3.05909991e-01, 2.60120004e-01,
1.85530007e-01, -2.40429997e-01, 4.28860001e-02,
4.06219989e-01, -2.42559999e-01, 6.38700008e-01,
6.99829996e-01, -1.40430003e-01, 2.52090007e-01,
4.89840001e-01, -6.10670000e-02, -3.67659986e-01,
-5.50890028e-01, -3.82649988e-01, -2.08430007e-01,
2.28320003e-01, 5.12179971e-01, 2.78679997e-01,
4.76520002e-01, 4.79510017e-02, -3.40079993e-01,
-3.28729987e-01, -4.19669986e-01, -7.54989982e-02,
-3.89539987e-01, -2.96219997e-02, -3.40700001e-01,
2.21699998e-01, -6.28560036e-02, -5.19029975e-01,
-3.77739996e-01, -4.34770016e-03, -5.83010018e-01,
-8.75459984e-02, -2.39289999e-01, -2.47109994e-01,
-2.58870006e-01, -2.98940003e-01, 1.37150005e-01,
2.98919994e-02, 3.65439989e-02, -4.96650010e-01,
-1.81600004e-01, 5.29389977e-01, 2.19919994e-01,
-4.45140004e-01, 3.77979994e-01, -5.70620000e-01,
-4.69460003e-02, 8.18059966e-02, 1.92789994e-02,
3.32459986e-01, -1.46200001e-01, 1.71560004e-01,
3.99809986e-01, 3.62170011e-01, 1.28160000e-01,
3.16439986e-01, 3.75690013e-01, -7.46899992e-02,
-4.84800003e-02, -3.14009994e-01, -1.92860007e-01,
-3.12940001e-01, -1.75529998e-02, -1.75139993e-01,
-2.75870003e-02, -1.00000000e+00, 1.83870003e-01,
8.14339995e-01, -1.89129993e-01, 5.09989977e-01,
-9.19600017e-03, -1.92950002e-03, 2.81890005e-01,
2.72470005e-02, 4.34089988e-01, -5.49669981e-01,
-9.74259973e-02, -2.45399997e-01, -1.72030002e-01,
-8.86500031e-02, -3.02980006e-01, -1.35910004e-01,
-2.77649999e-01, 3.12860007e-03, 2.05559999e-01,
-1.57720000e-01, -5.23079991e-01, -6.47010028e-01,
-3.70139986e-01, 6.93930015e-02, 1.14009999e-01,
2.75940001e-01, -1.38750002e-01, -2.72680014e-01,
6.68910027e-01, -5.64539991e-02, 2.40170002e-01,
-2.67300010e-01, 2.98599988e-01, 1.00830004e-01,
5.55920005e-01, 3.28489989e-01, 7.68579990e-02,
1.55279994e-01, 2.56359994e-01, -1.07720003e-01,
-1.23590000e-01, 1.18270002e-01, -9.90289971e-02,
-3.43279988e-01, 1.15019999e-01, -3.78080010e-01,
-3.90120000e-02, -3.45930010e-01, -1.94040000e-01,
-3.35799992e-01, -6.23340011e-02, 2.89189994e-01,
2.80319989e-01, -5.37410021e-01, 6.27939999e-01,
5.69549985e-02, 6.21469975e-01, -2.52819985e-01,
4.16700006e-01, -1.01079997e-02, -2.54339993e-01,
4.00029987e-01, 4.24320012e-01, 2.26720005e-01,
1.75530002e-01, 2.30489999e-01, 2.83230007e-01,
1.38820007e-01, 3.12180002e-03, 1.70570001e-01,
3.66849989e-01, 2.52470002e-03, -6.40089989e-01,
-2.97650009e-01, 7.89430022e-01, 3.31680000e-01,
-1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32)
p
| The #[code .vector] attribute will return an object's vector.
| #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] will default to an average
| of their token vectors. You can also check if a token has a vector
| assigned, and get the L2 norm, which can be used to normalise
| vectors.
+code.
tokens = nlp(u'dog cat banana sasquatch')
for token in tokens:
print(token.text, token.has_vector, token.vector_norm, token.is_oov)
+aside
| #[strong Text]: The original token text.#[br]
| #[strong has vector]: Does the token have a vector representation?#[br]
| #[strong Vector norm]: The L2 norm of the token's vector (the square root
| of the sum of the values squared)#[br]
| #[strong is OOV]: Is the word out-of-vocabulary?
+table(["Text", "Has vector", "Vector norm", "OOV"])
- var style = [0, 1, 1, 1]
+annotation-row(["dog", true, 7.033672992262838, false], style)
+annotation-row(["cat", true, 6.68081871208896, false], style)
+annotation-row(["banana", true, 6.700014292148571, false], style)
+annotation-row(["sasquatch", false, 0, true], style)
p
| The words "dog", "cat" and "banana" are all pretty common in English, so
| they're part of the model's vocabulary, and come with a vector. The word
| "sasquatch" on the other hand is a lot less common and out-of-vocabulary
| so its vector representation consists of 300 dimensions of #[code 0],
| which means it's practically nonexistent.
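p
| As a small sketch of the averaging behaviour described above (assuming the
| same #[code nlp] pipeline with word vectors), the #[code Doc] vector should
| match the mean of its token vectors:
+code.
import numpy
doc = nlp(u'dog cat banana')
token_average = numpy.mean([token.vector for token in doc], axis=0)
# Doc.vector defaults to the average of the token vectors
assert numpy.allclose(doc.vector, token_average)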
p
| If your application will benefit from a large vocabulary with more
| vectors, you should consider using one of the
| #[+a("/docs/usage/models#available") larger models] instead of the default,
| smaller ones, which usually come with a clipped vocabulary.

View File

@ -104,6 +104,9 @@ p
+image
include ../../assets/img/docs/language_data.svg
.u-text-right
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+table(["File name", "Variables", "Description"])
+row
@ -436,6 +439,8 @@ p
+h(3, "morph-rules") Morph rules
//- TODO: write morph rules section
+h(2, "testing") Testing the new language tokenizer
p
@ -533,8 +538,8 @@ p
| #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py]
| script from the spaCy developer resources. Note that your corpus should
| not be preprocessed (i.e. you need punctuation for example). The
| #[+a("/docs/usage/cli#model") #[code model] command] expects a
| tab-separated word frequencies file with three columns:
| #[+api("cli#model") #[code model]] command expects a tab-separated word
| frequencies file with three columns:
+list("numbers")
+item The number of times the word occurred in your language sample.
@ -626,37 +631,20 @@ p
| trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
| The #[code vectors.bin] file should consist of one word and vector per line.
+h(2, "model-directory") Setting up a model directory
p
| Once you've collected the word frequencies, Brown clusters and word
| vectors files, you can use the
| #[+a("/docs/usage/cli#model") #[code model] command] to create a data
| directory:
+code(false, "bash").
python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data]
+aside-code("your_data_directory", "yaml").
├── vocab/
| ├── lexemes.bin # via nlp.vocab.dump(path)
| ├── strings.json # via nlp.vocab.strings.dump(file_)
| └── oov_prob # optional
├── pos/ # optional
| ├── model # via nlp.tagger.model.dump(path)
| └── config.json # via Language.train
├── deps/ # optional
| ├── model # via nlp.parser.model.dump(path)
| └── config.json # via Language.train
└── ner/ # optional
├── model # via nlp.entity.model.dump(path)
└── config.json # via Language.train
p
| This creates a spaCy data directory with a vocabulary model, ready to be
| loaded. By default, the command expects to be able to find your language
| class using #[code spacy.util.get_lang_class(lang_id)].
| ├── lexemes.bin
| ├── strings.json
| └── oov_prob
├── pos/
| ├── model
| └── config.json
├── deps/
| ├── model
| └── config.json
└── ner/
├── model
└── config.json
+h(2, "train-tagger-parser") Training the tagger and parser
@ -666,13 +654,12 @@ p
| If your corpus uses the
| #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
| i.e. files with the extension #[code .conllu], you can use the
| #[+a("/docs/usage/cli#convert") #[code convert] command] to convert it to
| spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training.
| #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
| #[+a("/docs/api/annotation#json-input") JSON format] for training.
p
| Once you have your UD corpus transformed into JSON, you can train your
| model using spaCy's
| #[+a("/docs/usage/cli#train") #[code train] command]:
| model using spaCy's #[+api("cli#train") #[code train]] command:
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]

View File

@ -1,38 +0,0 @@
//- 💫 DOCS > USAGE > CUSTOMIZING THE PIPELINE
include ../../_includes/_mixins
p
| spaCy provides several linguistic annotation functions by default. Each
| function takes a Doc object, and modifies it in-place. The default
| pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0
| introduced the ability to customise this pipeline with arbitrary
| functions.
+code.
def arbitrary_fixup_rules(doc):
for token in doc:
if token.text == u'bill' and token.tag_ == u'NNP':
token.tag_ = u'NN'
def custom_pipeline(nlp):
return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity)
nlp = spacy.load('en', create_pipeline=custom_pipeline)
p
| The easiest way to customise the pipeline is to pass a
| #[code create_pipeline] callback to the #[code spacy.load()] function.
p
| The callback you pass to #[code create_pipeline] should take a single
| argument, and return a sequence of callables. Each callable in the
| sequence should accept a #[code Doc] object and modify it in place.
p
| Instead of passing a callback, you can also write to the
| #[code .pipeline] attribute directly.
+code.
nlp = spacy.load('en')
nlp.pipeline = [nlp.tagger]

View File

@ -11,18 +11,50 @@ p
| #[code spaces] booleans, which allow you to maintain alignment of the
| tokens into the original string.
+aside("See Also")
| If you haven't read up on spaCy's #[+a("data-model") data model] yet,
| you should probably have a look. The main point to keep in mind is that
| spaCy's #[code Doc] doesn't copy or refer to the original string. The
| string is reconstructed from the tokens when required.
+h(2, "101") Tokenizer 101
include _spacy-101/_tokenization
+h(3, "101-data") Tokenizer data
p
| #[strong Global] and #[strong language-specific] tokenizer data is
| supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang].
| The tokenizer exceptions define special cases like "don't" in English,
| which needs to be split into two tokens: #[code {ORTH: "do"}] and
| #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes
| mostly define punctuation rules, for example when to split off periods
| (at the end of a sentence), and when to leave a token containing periods
| intact (abbreviations like "U.S.").
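p
| As an illustrative sketch of what such an exception might look like (the
| variable name and exact structure are simplified here; see the language
| data for the real definitions):
+code.
from spacy.symbols import ORTH, LEMMA
# simplified, hypothetical exception entry: "don't" -> ["do", "n't"]
TOKENIZER_EXCEPTIONS = {
    u"don't": [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}]}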
+image
include ../../assets/img/docs/language_data.svg
.u-text-right
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+infobox
| For more details on the language-specific data, see the
| usage workflow on #[+a("/docs/usage/adding-languages") adding languages].
+h(2, "special-cases") Adding special case tokenization rules
p
| Most domains have at least some idiosyncrasies that require custom
| tokenization rules. This could be very specific expressions, or
| abbreviations only used in this particular field.
+aside("Language data vs. custom tokenization")
| Tokenization rules that are specific to one language, but can be
| #[strong generalised across that language] should ideally live in the
| language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang]; we
| always appreciate pull requests! Anything that's specific to a domain or
| text type, like financial trading abbreviations or Bavarian youth slang,
| should be added as a special case rule to your tokenizer instance. If
| you're dealing with a lot of customisations, it might make sense to create
| an entirely custom subclass.
p
| Here's how to add a special case rule to an existing
| #[+api("tokenizer") #[code Tokenizer]] instance:
+code.
@ -30,15 +62,12 @@ p
from spacy.symbols import ORTH, LEMMA, POS
nlp = spacy.load('en')
assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']
nlp.tokenizer.add_special_case(u'gimme',
[
{
ORTH: u'gim',
LEMMA: u'give',
POS: u'VERB'},
{
ORTH: u'me'}])
doc = nlp(u'gimme that') # phrase to tokenize
assert [w.text for w in doc] == [u'gimme', u'that'] # current tokenization
# add special case rule
special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
nlp.tokenizer.add_special_case(u'gimme', special_case)
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
@ -55,9 +84,8 @@ p
| The special case rules have precedence over the punctuation splitting:
+code.
nlp.tokenizer.add_special_case(u'...gimme...?',
[{
ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]
nlp.tokenizer.add_special_case(u'...gimme...?', special_case)
assert len(nlp(u'...gimme...?')) == 1
p
@ -137,8 +165,8 @@ p
+h(2, "native-tokenizers") Customizing spaCy's Tokenizer class
p
| Let's imagine you wanted to create a tokenizer for a new language. There
| are four things you would need to define:
| Let's imagine you wanted to create a tokenizer for a new language or
| specific domain. There are four things you would need to define:
+list("numbers")
+item
@ -170,14 +198,14 @@ p
import re
from spacy.tokenizer import Tokenizer
prefix_re = re.compile(r'''[\[\(&quot;']''')
suffix_re = re.compile(r'''[\]\)&quot;']''')
def create_tokenizer(nlp):
return Tokenizer(nlp.vocab,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search)
prefix_re = re.compile(r'''[\[\(&quot;&apos;]''')
suffix_re = re.compile(r'''[\]\)&quot;&apos;]''')
nlp = spacy.load('en', tokenizer=create_make_doc)
def create_tokenizer(nlp):
return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
suffix_search=suffix_re.search)
nlp = spacy.load('en', tokenizer=create_tokenizer)
p
| If you need to subclass the tokenizer instead, the relevant methods to
@ -187,29 +215,68 @@ p
+h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
p
| You can pass a custom tokenizer using the #[code make_doc] keyword, when
| you're creating the pipeline:
| The tokenizer is the first component of the processing pipeline and the
| only one that can't be replaced by writing to #[code nlp.pipeline]. This
| is because it has a different signature from all the other components:
| it takes a text and returns a #[code Doc], whereas all other components
| expect to already receive a tokenized #[code Doc].
+code.
import spacy
+image
include ../../assets/img/docs/pipeline.svg
.u-text-right
+button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
nlp = spacy.load('en', make_doc=my_tokenizer)
p
| However, this approach often leaves us with a chicken-and-egg problem.
| To construct the tokenizer, we usually want attributes of the #[code nlp]
| pipeline. Specifically, we want the tokenizer to hold a reference to the
| pipeline's vocabulary object. Let's say we have the following class as
| our tokenizer:
| To overwrite the existing tokenizer, you need to replace
| #[code nlp.tokenizer] with a custom function that takes a text, and
| returns a #[code Doc].
+code.
nlp = spacy.load('en')
nlp.tokenizer = my_tokenizer
+table(["Argument", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell The raw text to tokenize.
+footrow
+cell returns
+cell #[code Doc]
+cell The tokenized document.
+infobox("Important note: using a custom tokenizer")
.o-block
| In spaCy v1.x, you had to add a custom tokenizer by passing it to the
| #[code make_doc] keyword argument, or by passing a tokenizer "factory"
| to #[code create_make_doc]. This was unnecessarily complicated. Since
| spaCy v2.0, you can simply write to #[code nlp.tokenizer]. If your
| tokenizer needs the vocab, you can write a function and use
| #[code nlp.vocab].
+code-new.
nlp.tokenizer = my_tokenizer
nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
+code-old.
nlp = spacy.load('en', make_doc=my_tokenizer)
nlp = spacy.load('en', create_make_doc=my_tokenizer_factory)
+h(3, "custom-tokenizer-example") Example: A custom whitespace tokenizer
p
| To construct the tokenizer, we usually want attributes of the #[code nlp]
| pipeline. Specifically, we want the tokenizer to hold a reference to the
| vocabulary object. Let's say we have the following class as
| our tokenizer:
+code.
import spacy
from spacy.tokens import Doc
class WhitespaceTokenizer(object):
def __init__(self, nlp):
self.vocab = nlp.vocab
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
words = text.split(' ')
@ -218,28 +285,12 @@ p
return Doc(self.vocab, words=words, spaces=spaces)
p
| As you can see, we need a #[code vocab] instance to construct this — but
| we won't get the #[code vocab] instance until we get back the #[code nlp]
| object from #[code spacy.load()]. The simplest solution is to build the
| object in two steps:
| As you can see, we need a #[code Vocab] instance to construct this — but
| we won't have it until we get back the loaded #[code nlp] object. The
| simplest solution is to build the tokenizer in two steps. This also means
| that you can reuse the "tokenizer factory" and initialise it with
| different instances of #[code Vocab].
+code.
nlp = spacy.load('en')
nlp.make_doc = WhitespaceTokenizer(nlp)
p
| You can instead pass the class to the #[code create_make_doc] keyword,
| which is invoked as callback once the #[code nlp] object is ready:
+code.
nlp = spacy.load('en', create_make_doc=WhitespaceTokenizer)
p
| Finally, you can of course create your own subclasses, and create a bound
| #[code make_doc] method. The disadvantage of this approach is that spaCy
| uses inheritance to give each language-specific pipeline its own class.
| If you're working with multiple languages, a naive solution will
| therefore require one custom class per language you're working with.
| This might be at least annoying. You may be able to do something more
| generic by doing some clever magic with metaclasses or mixins, if that's
| the sort of thing you're into.
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

View File

@ -1,264 +0,0 @@
//- 💫 DOCS > USAGE > SPACY'S DATA MODEL
include ../../_includes/_mixins
p After reading this page, you should be able to:
+list
+item Understand how spaCy's Doc, Span, Token and Lexeme objects work
+item Start using spaCy's Cython API
+item Use spaCy more efficiently
+h(2, "architecture") Architecture
+image
include ../../assets/img/docs/architecture.svg
+h(2, "design-considerations") Design considerations
+h(3, "no-job-too-big") No job too big
p
| When writing spaCy, one of my mottos was #[em no job too big]. I wanted
| to make sure that if Google or Facebook were founded tomorrow, spaCy
| would be the obvious choice for them. I wanted spaCy to be the obvious
| choice for web-scale NLP. This meant sweating about performance, because
| for web-scale tasks, Moore's law can't save you.
p
| Most computational work gets less expensive over time. If you wrote a
| program to solve fluid dynamics in 2008, and you ran it again in 2014,
| you would expect it to be cheaper. For NLP, it often doesn't work out
| that way. The problem is that we're writing programs where the task is
| something like "Process all articles in the English Wikipedia". Sure,
| compute prices dropped from $0.80 per hour to $0.20 per hour on AWS in
| 2008-2014. But the size of Wikipedia grew from 3GB to 11GB. Maybe the
| job is a #[em little] cheaper in 2014 — but not by much.
+h(3, "annotation-layers") Multiple layers of annotation
p
| When I tell a certain sort of person that I'm a computational linguist,
| this comic is often the first thing that comes to their mind:
+image("http://i.imgur.com/n3DTzqx.png", 450)
+image-caption &copy; #[+a("http://xkcd.com") xkcd]
p
| I've thought a lot about what this comic is really trying to say. It's
| probably not talking about #[em data models] — but in that sense at
| least, it really rings true.
p
| You'll often need to model a document as a sequence of sentences. Other
| times you'll need to model it as a sequence of words. Sometimes you'll
| care about paragraphs, other times you won't. Sometimes you'll care
| about extracting quotes, which can cross paragraph boundaries. A quote
| can also occur within a sentence. When we consider sentence structure,
| things get even more complicated and contradictory. We have syntactic
| trees, sequences of entities, sequences of phrases, sub-word units,
| multi-word units...
p
| Different applications are going to need to query different,
| overlapping, and often contradictory views of the document. They're
| often going to need to query them jointly. You need to be able to get
| the syntactic head of a named entity, or the sentiment of a paragraph.
+h(2, "solutions") Solutions
+h(3) Fat types, thin tokens
+h(3) Static model, dynamic views
p
| Different applications are going to need to query different,
| overlapping, and often contradictory views of the document. For this
| reason, I think it's a bad idea to have too much of the document
| structure reflected in the data model. If you structure the data
| according to the needs of one layer of annotation, you're going to need
| to copy the data and transform it in order to use a different layer of
| annotation. You'll soon have lots of copies, and no single source of
| truth.
+h(3) Never go full stand-off
+h(3) Implementation
+h(3) Cython 101
+h(3) #[code cdef class Doc]
p
| Let's start at the top. Here's the memory layout of the
| #[+api("doc") #[code Doc]] class, minus irrelevant details:
+code.
from cymem.cymem cimport Pool
from ..vocab cimport Vocab
from ..structs cimport TokenC
cdef class Doc:
cdef Pool mem
cdef Vocab vocab
cdef TokenC* c
cdef int length
cdef int max_length
p
| So, our #[code Doc] class is a wrapper around a TokenC* array — that's
| where the actual document content is stored. Here's the #[code TokenC]
| struct, in its entirety:
+h(3) #[code cdef struct TokenC]
+code.
cdef struct TokenC:
const LexemeC* lex
uint64_t morph
univ_pos_t pos
bint spacy
int tag
int idx
int lemma
int sense
int head
int dep
bint sent_start
uint32_t l_kids
uint32_t r_kids
uint32_t l_edge
uint32_t r_edge
int ent_iob
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id
p
| The token owns all of its linguistic annotations, and holds a const
| pointer to a #[code LexemeC] struct. The #[code LexemeC] struct owns all
| of the #[em vocabulary] data about the word — all the dictionary
| definition stuff that we want to be shared by all instances of the type.
| Here's the #[code LexemeC] struct, in its entirety:
+h(3) #[code cdef struct LexemeC]
+code.
cdef struct LexemeC:
int32_t id
int32_t orth # Allows the string to be retrieved
int32_t length # Length of the string
uint64_t flags # These are the most useful parts.
int32_t cluster # Distributional similarity cluster
float prob # Probability
float sentiment # Slot for sentiment
int32_t lang
int32_t lower # These string views made sense
int32_t norm # when NLP meant linear models.
int32_t shape # Now they're less relevant, and
int32_t prefix # will probably be revised.
int32_t suffix
float* vector # &lt;-- This was a design mistake, and will change.
+h(2, "dynamic-views") Dynamic views
+h(3) Text
p
| You might have noticed that in all of the structs above, there's not a
| string to be found. The strings are all stored separately, in the
| #[+api("stringstore") #[code StringStore]] class. The lexemes don't know
| the strings — they only know their integer IDs. The document string is
| never stored anywhere, either. Instead, it's reconstructed by iterating
| over the tokens, which look up the #[code orth] attribute of their
| underlying lexeme. Once we have the orth ID, we can fetch the string
| from the vocabulary. Finally, each token knows whether a single
| whitespace character (#[code ' ']) should be used to separate it from
| the subsequent tokens. This allows us to preserve whitespace.
+code.
cdef print_text(Vocab vocab, const TokenC* tokens, int length):
for i in range(length):
word_string = vocab.strings[tokens[i].lex.orth]
if tokens[i].spacy:
word_string += ' '
print(word_string)
p
| This is why you get whitespace tokens in spaCy — we need those tokens,
| so that we can reconstruct the document string. I also think you should
| have those tokens anyway. Most NLP libraries strip them, making it very
| difficult to recover the paragraph information once you're at the token
| level. You'll never have that sort of problem with spaCy — because
| there's a single source of truth.
+h(3) #[code cdef class Token]
p When you do...
+code.
doc[i]
p
| ...you get back an instance of class #[code spacy.tokens.token.Token].
| This instance owns no data. Instead, it holds the information
| #[code (doc, i)], and uses these to retrieve all information via the
| parent container.
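p
| A short sketch of this behaviour from Python (assuming a loaded
| #[code nlp] pipeline):
+code.
doc = nlp(u'Give it back')
token = doc[0]
# the Token is only a view: it stores (doc, i) and looks everything up in the Doc
assert token.doc is doc
assert token.i == 0
assert token.text == u'Give'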
+h(3) #[code cdef class Span]
p When you do...
+code.
doc[i : j]
p
| ...you get back an instance of class #[code spacy.tokens.span.Span].
| #[code Span] instances are also returned by the #[code .sents],
| #[code .ents] and #[code .noun_chunks] iterators of the #[code Doc]
| object. A #[code Span] is a slice of tokens, with an optional label
| attached. Its data model is:
+code.
cdef class Span:
cdef readonly Doc doc
cdef int start
cdef int end
cdef int start_char
cdef int end_char
cdef int label
p
| Once again, the #[code Span] owns almost no data. Instead, it refers
| back to the parent #[code Doc] container.
p
| The #[code start] and #[code end] attributes refer to token positions,
| while #[code start_char] and #[code end_char] record the character
| positions of the span. By recording the character offsets, we can still
| use the #[code Span] object if the tokenization of the document changes.
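p
| For example (a minimal sketch, assuming a loaded #[code nlp] pipeline):
+code.
doc = nlp(u'I like New York')
span = doc[2:4]
assert (span.start, span.end) == (2, 4)             # token offsets
assert (span.start_char, span.end_char) == (7, 15)  # character offsets
assert doc.text[span.start_char : span.end_char] == u'New York'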
+h(3) #[code cdef class Lexeme]
p When you do...
+code.
vocab[u'the']
p
| ...you get back an instance of class #[code spacy.lexeme.Lexeme]. The
| #[code Lexeme]'s data model is:
+code.
cdef class Lexeme:
cdef LexemeC* c
cdef readonly Vocab vocab
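p
| A small sketch showing that all tokens of the same type share one
| underlying lexeme (again assuming a loaded #[code nlp] pipeline):
+code.
doc = nlp(u'coffee coffee coffee')
lexeme = nlp.vocab[u'coffee']
# every "coffee" token refers to the same vocabulary entry
assert all(token.orth == lexeme.orth for token in doc)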

View File

@ -6,57 +6,85 @@ p
| spaCy features a fast and accurate syntactic dependency parser, and has
| a rich API for navigating the tree. The parser also powers the sentence
| boundary detection, and lets you iterate over base noun phrases, or
| "chunks".
| "chunks". You can check whether a #[+api("doc") #[code Doc]] object has
| been parsed with the #[code doc.is_parsed] attribute, which returns a
| boolean value. If this attribute is #[code False], the default sentence
| iterator will raise an exception.
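p
| For example, a minimal sketch of this check (assuming a loaded
| #[code nlp] pipeline):
+code.
doc = nlp(u'This is a sentence. This is another sentence.')
# sentence iteration requires the dependency parse, so check the flag first
if doc.is_parsed:
    for sent in doc.sents:
        print(sent.text)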
+aside-code("Example").
import spacy
+h(2, "noun-chunks") Noun chunks
+tag-model("dependency parse")
p
| Noun chunks are "base noun phrases": flat phrases that have a noun as
| their head. You can think of noun chunks as a noun plus the words describing
| the noun, for example "the lavish green grass" or "the world's largest
| tech fund". To get the noun chunks in a document, simply iterate over
| #[+api("doc#noun_chunks") #[code Doc.noun_chunks]].
+code("Example").
nlp = spacy.load('en')
doc = nlp(u'I like green eggs and ham.')
for np in doc.noun_chunks:
print(np.text, np.root.text, np.root.dep_, np.root.head.text)
# I I nsubj like
# green eggs eggs dobj like
# ham ham conj eggs
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
for chunk in doc.noun_chunks:
print(chunk.text, chunk.root.text, chunk.root.dep_,
chunk.root.head.text)
p
| You can check whether a #[+api("doc") #[code Doc]] object has been
| parsed with the #[code doc.is_parsed] attribute, which returns a boolean
| value. If this attribute is #[code False], the default sentence iterator
| will raise an exception.
+aside
| #[strong Text:] The original noun chunk text.#[br]
| #[strong Root text:] The original text of the word connecting the noun
| chunk to the rest of the parse.#[br]
| #[strong Root dep:] Dependency relation connecting the root to its head.#[br]
| #[strong Root head text:] The text of the root token's head.#[br]
+h(2, "displacy") The displaCy visualizer
p
| The best way to understand spaCy's dependency parser is interactively,
| through the #[+a(DEMOS_URL + "/displacy", true) displaCy visualizer]. If
| you want to know how to write rules that hook into some type of syntactic
| construction, just plug the sentence into the visualizer and see how
| spaCy annotates it.
+table(["Text", "root.text", "root.dep_", "root.head.text"])
- var style = [0, 0, 1, 0]
+annotation-row(["Autonomous cars", "cars", "nsubj", "shift"], style)
+annotation-row(["insurance liability", "liability", "dobj", "shift"], style)
+annotation-row(["manufacturers", "manufacturers", "pobj", "toward"], style)
+h(2, "navigating") Navigating the parse tree
p
| spaCy uses the terms #[em head] and #[em child] to describe the words
| connected by a single arc in the dependency tree. The term #[em dep] is
| used for the arc label, which describes the type of syntactic relation
| that connects the child to the head. As with other attributes, the value
| of #[code token.dep] is an integer. You can get the string value with
| #[code token.dep_].
| spaCy uses the terms #[strong head] and #[strong child] to describe the words
| #[strong connected by a single arc] in the dependency tree. The term
| #[strong dep] is used for the arc label, which describes the type of
| syntactic relation that connects the child to the head. As with other
| attributes, the value of #[code .dep] is an integer. You can get
| the string value with #[code .dep_].
+aside-code("Example").
from spacy.symbols import det
the, dog = nlp(u'the dog')
assert the.dep == det
assert the.dep_ == 'det'
+code("Example").
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
for token in doc:
print(token.text, token.dep_, token.head.text, token.head.pos_,
[child for child in token.children])
+aside
| #[strong Text]: The original token text.#[br]
| #[strong Dep]: The syntactic relation connecting child to head.#[br]
| #[strong Head text]: The original text of the token head.#[br]
| #[strong Head POS]: The part-of-speech tag of the token head.#[br]
| #[strong Children]: The immediate syntactic dependents of the token.
+table(["Text", "Dep", "Head text", "Head POS", "Children"])
- var style = [0, 1, 0, 1, 0]
+annotation-row(["Autonomous", "amod", "cars", "NOUN", ""], style)
+annotation-row(["cars", "nsubj", "shift", "VERB", "Autonomous"], style)
+annotation-row(["shift", "ROOT", "shift", "VERB", "cars, liability"], style)
+annotation-row(["insurance", "compound", "liability", "NOUN", ""], style)
+annotation-row(["liability", "dobj", "shift", "VERB", "insurance, toward"], style)
+annotation-row(["toward", "prep", "liability", "NOUN", "manufacturers"], style)
+annotation-row(["manufacturers", "pobj", "toward", "ADP", ""], style)
+codepen("dcf8d293367ca185b935ed2ca11ebedd", 370)
p
| Because the syntactic relations form a tree, every word has exactly one
| head. You can therefore iterate over the arcs in the tree by iterating
| over the words in the sentence. This is usually the best way to match an
| arc of interest — from below:
| Because the syntactic relations form a tree, every word has
| #[strong exactly one head]. You can therefore iterate over the arcs in
| the tree by iterating over the words in the sentence. This is usually
| the best way to match an arc of interest — from below:
+code.
from spacy.symbols import nsubj, VERB
# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
@ -82,6 +110,8 @@ p
| attribute, which provides a sequence of #[+api("token") #[code Token]]
| objects.
+h(3, "navigating-around") Iterating around the local tree
p
| A few more convenience attributes are provided for iterating around the
| local tree from the token. The #[code .lefts] and #[code .rights]
@ -90,75 +120,118 @@ p
| two integer-typed attributes, #[code .n_rights] and #[code .n_lefts],
| that give the number of left and right children.
+aside-code("Examples").
apples = nlp(u'bright red apples on the tree')[2]
print([w.text for w in apples.lefts])
# ['bright', 'red']
print([w.text for w in apples.rights])
# ['on']
assert apples.n_lefts == 2
assert apples.n_rights == 1
from spacy.symbols import nsubj
doc = nlp(u'Credit and mortgage account holders must submit their requests within 30 days.')
root = [w for w in doc if w.head is w][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
assert subject.is_ancestor_of(descendant)
from spacy.symbols import nsubj
doc = nlp(u'Credit and mortgage account holders must submit their requests.')
holders = doc[4]
span = doc[holders.left_edge.i : holders.right_edge.i + 1]
span.merge()
for word in doc:
print(word.text, word.pos_, word.dep_, word.head.text)
# Credit and mortgage account holders nsubj NOUN submit
# must VERB aux submit
# submit VERB ROOT submit
# their DET det requests
# requests NOUN dobj submit
+code.
doc = nlp(u'bright red apples on the tree')
assert [token.text for token in doc[2].lefts] == [u'bright', u'red']
assert [token.text for token in doc[2].rights] == [u'on']
assert doc[2].n_lefts == 2
assert doc[2].n_rights == 1
p
| You can get a whole phrase by its syntactic head using the
| #[code .subtree] attribute. This returns an ordered sequence of tokens.
| For the default English model, the parse tree is #[em projective], which
| means that there are no crossing brackets. The tokens returned by
| #[code .subtree] are therefore guaranteed to be contiguous. This is not
| true for the German model, which has many
| #[+a("https://explosion.ai/blog/german-model#word-order", true) non-projective dependencies].
| You can walk up the tree with the #[code .ancestors] attribute, and
| check dominance with the #[code .is_ancestor()] method.
| check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]]
| method.
+aside("Projective vs. non-projective")
| For the #[+a("/docs/usage/models#available") default English model], the
| parse tree is #[strong projective], which means that there are no crossing
| brackets. The tokens returned by #[code .subtree] are therefore guaranteed
| to be contiguous. This is not true for the German model, which has many
| #[+a(COMPANY_URL + "/blog/german-model#word-order", true) non-projective dependencies].
+code.
doc = nlp(u'Credit and mortgage account holders must submit their requests')
root = [token for token in doc if token.head is token][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
assert subject.is_ancestor(descendant)
print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights,
[ancestor.text for ancestor in descendant.ancestors])
+table(["Text", "Dep", "n_lefts", "n_rights", "ancestors"])
- var style = [0, 1, 1, 1, 0]
+annotation-row(["Credit", "nmod", 0, 2, "holders, submit"], style)
+annotation-row(["and", "cc", 0, 0, "Credit, holders, submit"], style)
+annotation-row(["mortgage", "compound", 0, 0, "account, Credit, holders, submit"], style)
+annotation-row(["account", "conj", 1, 0, "Credit, holders, submit"], style)
+annotation-row(["holders", "nsubj", 1, 0, "submit"], style)
p
| Finally, I often find the #[code .left_edge] and #[code .right_edge]
| attributes especially useful. They give you the first and last token
| Finally, the #[code .left_edge] and #[code .right_edge] attributes
| can be especially useful, because they give you the first and last token
| of the subtree. This is the easiest way to create a #[code Span] object
| for a syntactic phrase — a useful operation.
| for a syntactic phrase. Note that #[code .right_edge] gives a token
| #[strong within] the subtree — so if you use it as the end-point of a
| range, don't forget to #[code +1]!
+code.
doc = nlp(u'Credit and mortgage account holders must submit their requests')
span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]
span.merge()
for token in doc:
print(token.text, token.pos_, token.dep_, token.head.text)
+table(["Text", "POS", "Dep", "Head text"])
- var style = [0, 1, 1, 0]
+annotation-row(["Credit and mortgage account holders", "NOUN", "nsubj", "submit"], style)
+annotation-row(["must", "VERB", "aux", "submit"], style)
+annotation-row(["submit", "VERB", "ROOT", "submit"], style)
+annotation-row(["their", "ADJ", "poss", "requests"], style)
+annotation-row(["requests", "NOUN", "dobj", "submit"], style)
+h(2, "displacy") Visualizing dependencies
p
| Note that #[code .right_edge] gives a token #[em within] the subtree —
| so if you use it as the end-point of a range, don't forget to #[code +1]!
| The best way to understand spaCy's dependency parser is interactively.
| To make this easier, spaCy v2.0+ comes with a visualization module. Simply
| pass a #[code Doc] or a list of #[code Doc] objects to
| displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
| run the web server, or #[+api("displacy#render") #[code displacy.render]]
| to generate the raw markup. If you want to know how to write rules that
| hook into some type of syntactic construction, just plug the sentence into
| the visualizer and see how spaCy annotates it.
+code.
from spacy import displacy
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
displacy.serve(doc, style='dep')
+infobox
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. You
| can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo].
+h(2, "disabling") Disabling the parser
p
| The parser is loaded and enabled by default. If you don't need any of
| the syntactic information, you should disable the parser. Disabling the
| parser will make spaCy load and run much faster. Here's how to prevent
| the parser from being loaded:
| In the #[+a("/docs/usage/models#available") default models], the parser
| is loaded and enabled as part of the
| #[+a("/docs/usage/language-processing-pipeline") standard processing pipeline].
| If you don't need any of the syntactic information, you should disable
| the parser. Disabling the parser will make spaCy load and run much faster.
| If you want to load the parser, but need to disable it for specific
| documents, you can also control its use on the #[code nlp] object.
+code.
import spacy
from spacy.lang.en import English  # needed below for English().from_disk()
nlp = spacy.load('en', disable=['parser'])
nlp = English().from_disk('/model', disable=['parser'])
doc = nlp(u"I don't want parsed", disable=['parser'])
nlp = spacy.load('en', parser=False)
p
| If you need to load the parser, but need to disable it for specific
| documents, you can control its use with the #[code parse] keyword
| argument:
+code.
nlp = spacy.load('en')
doc1 = nlp(u'Text I do want parsed.')
doc2 = nlp(u"Text I don't want parsed", parse=False)
+infobox("Important note: disabling pipeline components")
.o-block
| Since spaCy v2.0 comes with better support for customising the
| processing pipeline components, the #[code parser] keyword argument
| has been replaced with #[code disable], which takes a list of
| #[+a("/docs/usage/language-processing-pipeline") pipeline component names].
| This lets you disable both default and custom components when loading
| a model, or initialising a Language class via
| #[+api("language-from_disk") #[code from_disk]].
+code-new.
nlp = spacy.load('en', disable=['parser'])
doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old.
nlp = spacy.load('en', parser=False)
doc = nlp(u"I don't want parsed", parse=False)

View File

@ -9,14 +9,12 @@ p
| locations, organizations and products. You can add arbitrary classes to
| the entity recognition system, and update the model with new examples.
+aside-code("Example").
import spacy
nlp = spacy.load('en')
doc = nlp(u'London is a big city in the United Kingdom.')
for ent in doc.ents:
print(ent.label_, ent.text)
# GPE London
# GPE United Kingdom
+h(2, "101") Named Entity Recognition 101
+tag-model("named entities")
include _spacy-101/_named-entities
+h(2, "accessing") Accessing entity annotations
p
| The standard way to access entity annotations is the
@ -26,56 +24,89 @@ p
| #[code ent.label] and #[code ent.label_]. The #[code Span] object acts
| as a sequence of tokens, so you can iterate over the entity or index into
| it. You can also get the text form of the whole entity, as though it were
| a single token. See the #[+api("span") API reference] for more details.
| a single token.
p
| You can access token entity annotations using the #[code token.ent_iob]
| and #[code token.ent_type] attributes. The #[code token.ent_iob]
| attribute indicates whether an entity starts, continues or ends on the
| tag (In, Begin, Out).
| You can also access token entity annotations using the
| #[+api("token#attributes") #[code token.ent_iob]] and
| #[+api("token#attributes") #[code token.ent_type]] attributes.
| #[code token.ent_iob] indicates whether an entity starts, continues or
| ends on the tag. If no entity type is set on a token, it will return an
| empty string.
+aside("IOB Scheme")
| #[code I] Token is inside an entity.#[br]
| #[code O] Token is outside an entity.#[br]
| #[code B] Token is the beginning of an entity.#[br]
+code("Example").
doc = nlp(u'London is a big city in the United Kingdom.')
print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_)
# (u'London', 2, u'GPE')
print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_)
# (u'is', 3, u'')
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(u'San Francisco', 0, 13, u'GPE')]
# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
assert ent_san == [u'San', u'B', u'GPE']
assert ent_francisco == [u'Francisco', u'I', u'GPE']
+table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"])
- var style = [0, 1, 1, 1, 1, 0]
+annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style)
+annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style)
+annotation-row(["considers", 2, "O", 0, '""', "outside an entity"], style)
+annotation-row(["banning", 2, "O", 0, '""', "outside an entity"], style)
+annotation-row(["sidewalk", 2, "O", 0, '""', "outside an entity"], style)
+annotation-row(["delivery", 2, "O", 0, '""', "outside an entity"], style)
+annotation-row(["robots", 2, "O", 0, '""', "outside an entity"], style)
+h(2, "setting") Setting entity annotations
p
| To ensure that the sequence of token annotations remains consistent, you
| have to set entity annotations at the document level — you can't write
| directly to the #[code token.ent_iob] or #[code token.ent_type]
| attributes. The easiest way to set entities is to assign to the
| #[code doc.ents] attribute.
| have to set entity annotations #[strong at the document level]. However,
| you can't write directly to the #[code token.ent_iob] or
| #[code token.ent_type] attributes, so the easiest way to set entities is
| to assign to the #[+api("doc#ents") #[code doc.ents]] attribute
| and create the new entity as a #[+api("span") #[code Span]].
+code("Example").
doc = nlp(u'London is a big city in the United Kingdom.')
doc.ents = []
assert doc[0].ent_type_ == ''
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings['GPE'])]
assert doc[0].ent_type_ == 'GPE'
doc.ents = []
doc.ents = [(u'LondonCity', doc.vocab.strings['GPE'], 0, 1)]
from spacy.tokens import Span
doc = nlp(u'Netflix is hiring a new VP of global policy')
# the model didn't recognise any entities :(
ORG = doc.vocab.strings[u'ORG'] # get integer ID of entity label
netflix_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity
doc.ents = [netflix_ent]
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(u'Netflix', 0, 7, u'ORG')]
p
| The value you assign should be a sequence, the values of which
| can either be #[code Span] objects, or #[code (ent_id, ent_type, start, end)]
| tuples, where #[code start] and #[code end] are token offsets that
| describe the slice of the document that should be annotated.
| Keep in mind that you need to create a #[code Span] with the start and
| end index of the #[strong token], not the start and end index of the
| entity in the document. In this case, "Netflix" is token #[code (0, 1)]
| but at the document level, the entity will have the start and end
| indices #[code (0, 7)].
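p
| As a minimal sketch of this difference, using the #[code doc] from the
| example above:
+code.
netflix = doc.ents[0]
assert (netflix.start, netflix.end) == (0, 1)            # token indices
assert (netflix.start_char, netflix.end_char) == (0, 7)  # character offsets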
+h(3, "setting-from-array") Setting entity annotations from array
p
| You can also assign entity annotations using the #[code doc.from_array()]
| method. To do this, you should include both the #[code ENT_TYPE] and the
| #[code ENT_IOB] attributes in the array you're importing from.
| You can also assign entity annotations using the
| #[+api("doc#from_array") #[code doc.from_array()]] method. To do this,
| you should include both the #[code ENT_TYPE] and the #[code ENT_IOB]
| attributes in the array you're importing from.
+code("Example").
from spacy.attrs import ENT_IOB, ENT_TYPE
+code.
import numpy
from spacy.attrs import ENT_IOB, ENT_TYPE
doc = nlp.make_doc(u'London is a big city in the United Kingdom.')
assert list(doc.ents) == []
header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)))
attr_array[0, 0] = 3 # B
@ -83,12 +114,14 @@ p
doc.from_array(header, attr_array)
assert list(doc.ents)[0].text == u'London'
+h(3, "setting-cython") Setting entity annotations in Cython
p
| Finally, you can always write to the underlying struct, if you compile
| a Cython function. This is easy to do, and allows you to write efficient
| native code.
| a #[+a("http://cython.org/") Cython] function. This is easy to do, and
| allows you to write efficient native code.
+code("Example").
+code.
# cython: infer_types=True
from spacy.tokens.doc cimport Doc
@ -104,67 +137,30 @@ p
| you'll have responsibility for ensuring that the data is left in a
| consistent state.
+h(2, "displacy") Visualizing named entities
p
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
| lets you explore an entity recognition model's behaviour interactively.
| If you're training a model, it's very useful to run the visualization
| yourself. To help you do that, spaCy v2.0+ comes with a visualization
| module. Simply pass a #[code Doc] or a list of #[code Doc] objects to
| displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
| run the web server, or #[+api("displacy#render") #[code displacy.render]]
| to generate the raw markup.
p
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy].
+code("Named Entity example").
import spacy
from spacy import displacy
text = """But Google is starting from behind. The company made a late push
into hardware, and Apple's Siri, available on iPhones, and Amazon's Alexa
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""
nlp = spacy.load('custom_ner_model')
doc = nlp(text)
displacy.serve(doc, style='ent')
+codepen("a73f8b68f9af3157855962b283b364e4", 345)
+h(2, "entity-types") Built-in entity types
include ../api/_annotation/_named-entities
+aside("Tip: Understanding entity types")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
+aside("Install")
| The #[+api("load") #[code spacy.load()]] function configures a pipeline that
| includes all of the available annotators for the given ID. In the example
| above, the #[code 'en'] ID tells spaCy to load the default English
| pipeline. If you have installed the data with
| #[code python -m spacy download en], this will include the entity
| recognition model.
include ../api/_annotation/_named-entities
+h(2, "updating") Training and updating
p
| To provide training examples to the entity recogniser, you'll first need
| to create an instance of the #[code GoldParse] class. You can specify
| your annotations in a stand-off format or as token tags.
| to create an instance of the #[+api("goldparse") #[code GoldParse]] class.
| You can specify your annotations in a stand-off format or as token tags.
+code.
import spacy
import random
import spacy
from spacy.gold import GoldParse
from spacy.language import EntityRecognizer
from spacy.pipeline import EntityRecognizer
train_data = [
('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
]
train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
nlp = spacy.load('en', entity=False, parser=False)
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
@ -237,3 +233,34 @@ p
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
| imitation learning strategy. The transition system is equivalent to the
| BILOU tagging scheme.
+h(2, "displacy") Visualizing named entities
p
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
| lets you explore an entity recognition model's behaviour interactively.
| If you're training a model, it's very useful to run the visualization
| yourself. To help you do that, spaCy v2.0+ comes with a visualization
| module. Simply pass a #[code Doc] or a list of #[code Doc] objects to
| displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
| run the web server, or #[+api("displacy#render") #[code displacy.render]]
| to generate the raw markup.
p
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy].
+code("Named Entity example").
import spacy
from spacy import displacy
text = """But Google is starting from behind. The company made a late push
into hardware, and Apple's Siri, available on iPhones, and Amazon's Alexa
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""
nlp = spacy.load('custom_ner_model')
doc = nlp(text)
displacy.serve(doc, style='ent')
+codepen("a73f8b68f9af3157855962b283b364e4", 345)

View File

@ -175,6 +175,136 @@ p
+cell Python 3.5+
+cell Visual Studio 2015
+h(2, "troubleshooting") Troubleshooting guide
p
| This section collects some of the most common errors you may come
| across when installing, loading and using spaCy, as well as their solutions.
+aside("Help us improve this guide")
| Did you come across a problem like the ones listed here and want to
| share the solution? You can find the "Suggest edits" button at the
| bottom of this page that points you to the source. We always
| appreciate #[+a(gh("spaCy") + "/pulls") pull requests]!
+h(3, "compatible-model") No compatible model found
+code(false, "text").
No compatible model found for [lang] (spaCy v#{SPACY_VERSION}).
p
| This usually means that the model you're trying to download does not
| exist, or isn't available for your version of spaCy. Check the
| #[+a(gh("spacy-models", "compatibility.json")) compatibility table]
| to see which models are available for your spaCy version. If you're using
| an old version, consider upgrading to the latest release. Note that while
| spaCy supports tokenization for
| #[+a("/docs/api/language-models/#alpha-support") a variety of languages],
| not all of them come with statistical models. To only use the tokenizer,
| import the language's #[code Language] class instead, for example
| #[code from spacy.lang.fr import French].
+h(3, "symlink-privilege") Symbolic link privilege not held
+code(false, "text").
OSError: symbolic link privilege not held
p
| To create #[+a("/docs/usage/models/#usage") shortcut links] that let you
| load models by name, spaCy creates a symbolic link in the
| #[code spacy/data] directory. This means your user needs permission to do
| this. The above error mostly occurs when doing a system-wide installation,
| which will create the symlinks in a system directory. Run the
| #[code download] or #[code link] command as administrator, or use a
| #[code virtualenv] to install spaCy in a user directory, instead
| of doing a system-wide installation.
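p
| For example, a rough sketch of a user-level installation with
| #[code virtualenv] (exact commands depend on your platform and shell):
+code(false, "bash").
python -m virtualenv .env
source .env/bin/activate
pip install -U spacy
python -m spacy download en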
+h(3, "no-cache-dir") No such option: --no-cache-dir
+code(false, "text").
no such option: --no-cache-dir
p
| The #[code download] command uses pip to install the models and sets the
| #[code --no-cache-dir] flag to prevent it from requiring too much memory.
| #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
| requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to
| the latest version of pip. To see which version you have installed,
| run #[code pip --version].
+h(3, "import-error") Import error
+code(false, "text").
Import Error: No module named spacy
p
| This error means that the spaCy module can't be located on your system, or in
| your environment. Make sure you have spaCy installed. If you're using a
| #[code virtualenv], make sure it's activated and check that spaCy is
| installed in that environment; otherwise, you're trying to load a system
| installation. You can also run #[code which python] to find out where
| your Python executable is located.
+h(3, "import-error-models") Import error: models
+code(false, "text").
ImportError: No module named 'en_core_web_sm'
p
| As of spaCy v1.7, all models can be installed as Python packages. This means
| that they'll become importable modules of your application. When creating
| #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try
| to import the model to load its meta data. If this fails, it's usually a
| sign that the package is not installed in the current environment.
| Run #[code pip list] or #[code pip freeze] to check which model packages
| you have installed, and install the
| #[+a("/docs/usage/models#available") correct models] if necessary. If you're
| importing a model manually at the top of a file, make sure to use the name
| of the package, not the shortcut link you've created.
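p
| For example, a minimal sketch of importing a model package directly
| (assuming #[code en_core_web_sm] is installed in your environment):
+code.
import en_core_web_sm
nlp = en_core_web_sm.load()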
+h(3, "vocab-strings") File not found: vocab/strings.json
+code(false, "text").
FileNotFoundError: No such file or directory: [...]/vocab/strings.json
p
| This error may occur when using #[code spacy.load()] to load
| a language model, either because you haven't set up a
| #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it
| doesn't actually exist. Set up a
| #[+a("/docs/usage/models/#usage") shortcut link] for the model
| you want to load. This can either be an installed model package, or a
| local directory containing the model data. If you want to use one of the
| #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for
| languages that don't yet have a statistical model, you should import its
| #[code Language] class instead, for example
| #[code from spacy.lang.bn import Bengali].
+h(3, "command-not-found") Command not found
+code(false, "text").
command not found: spacy
p
| This error may occur when running the #[code spacy] command from the
| command line. spaCy does not currently add an entry to your #[code PATH]
| environment variable, as this can lead to unexpected results, especially
| when using #[code virtualenv]. Run the command with #[code python -m],
| for example #[code python -m spacy download en]. For more info on this,
| see #[+api("cli#download") download].
+h(3, "module-load") 'module' object has no attribute 'load'
+code(false, "text").
AttributeError: 'module' object has no attribute 'load'
p
| While this could technically have many causes, including spaCy being
| broken, the most likely one is that your script's file or directory name
| is "shadowing" the module, e.g. your file is called #[code spacy.py],
| or a directory you're importing from is called #[code spacy]. So, when
| using spaCy, never call anything else #[code spacy].
+h(2, "tests") Run tests
p

View File

@ -2,127 +2,355 @@
include ../../_includes/_mixins
p
| The standard entry point into spaCy is the #[code spacy.load()]
| function, which constructs a language processing pipeline. The standard
| variable name for the language processing pipeline is #[code nlp], for
| Natural Language Processing. The #[code nlp] variable is usually an
| instance of class #[code spacy.language.Language]. For English, the
| #[code spacy.en.English] class is the default.
+h(2, "101") Pipelines 101
include _spacy-101/_pipelines
+h(2, "pipelines") How pipelines work
p
| You'll use the nlp instance to produce #[+api("doc") #[code Doc]]
| objects. You'll then use the #[code Doc] object to access linguistic
| annotations to help you with whatever text processing task you're
| trying to do.
| spaCy makes it very easy to create your own pipelines consisting of
| reusable components. This includes spaCy's default vectorizer, tagger,
| parser and entity recognizer, but also your own custom processing
| functions. A pipeline component can be added to an already existing
| #[code nlp] object, specified when initialising a #[code Language] class,
| or defined within a
| #[+a("/docs/usage/saving-loading#models-generating") model package].
p
| When you load a model, spaCy first consults the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
| #[code setup] details. This typically includes the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
+aside-code("meta.json (excerpt)", "json").
{
    "name": "example_model",
    "description": "Example model for spaCy",
    "setup": {
        "lang": "en",
        "pipeline": ["token_vectors", "tagger"]
    }
}
+list("numbers")
+item
| Look up #[strong pipeline IDs] in the available
| #[strong pipeline factories].
+item
| Initialise the #[strong pipeline components] by calling their
| factories with the #[code Vocab] as an argument. This gives each
| factory and component access to the pipeline's shared data, like
| strings, morphology and annotation scheme.
+item
| Load the #[strong language class and data] for the given ID via
| #[+api("util.get_lang_class") #[code get_lang_class]].
+item
| Pass the path to the #[strong model data] to the #[code Language]
| class and return it.
p
| So when you call this...
+code.
import spacy # See "Installing spaCy"
nlp = spacy.load('en') # You are here.
doc = nlp(u'Hello, spacy!') # See "Using the pipeline"
print([(w.text, w.pos_) for w in doc]) # See "Doc, Span and Token"
+aside("Why do we have to preload?")
| Loading the models takes ~200x longer than
| processing a document. We therefore want to amortize the start-up cost
| across multiple invocations. It's often best to wrap the pipeline as a
| singleton. The library avoids doing that for you, because it's a
| difficult design to back out of.
p The #[code load] function takes the following positional arguments:
+table([ "Name", "Description" ])
+row
+cell #[code lang_id]
+cell
| An ID that is resolved to a class or factory function by
| #[code spacy.util.get_lang_class()]. Common values are
| #[code 'en'] for the English pipeline, or #[code 'de'] for the
| German pipeline. You can register your own factory function or
| class with #[code spacy.util.set_lang_class()].
nlp = spacy.load('en')
p
| All keyword arguments are passed forward to the pipeline factory. No
| keyword arguments are required. The built-in factories (e.g.
| #[code spacy.en.English], #[code spacy.de.German]), which are subclasses
| of #[+api("language") #[code Language]], respond to the following
| keyword arguments:
| ... the model tells spaCy to use the pipeline
| #[code ["vectorizer", "tagger", "parser", "ner"]]. spaCy will then look
| up each string in its internal factories registry and initialise the
| individual components. It'll then load #[code spacy.lang.en.English],
| pass it the path to the model's data directory, and return it for you
| to use as the #[code nlp] object.
+table([ "Name", "Description"])
p
| When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
| then #[strong call each component] on the #[code Doc], in order.
| Components all return the modified document, which is then processed by
| the component next in the pipeline.
+code("The pipeline under the hood").
doc = nlp.make_doc(u'This is a sentence')
for proc in nlp.pipeline:
    doc = proc(doc)
+h(2, "creating") Creating pipeline components and factories
p
| spaCy lets you customise the pipeline with your own components. Components
| are functions that receive a #[code Doc] object, modify it and return it.
| If your component is stateful, you'll want to create a new one for each
| pipeline. You can do that by defining and registering a factory which
| receives the shared #[code Vocab] object and returns a component.
+h(3, "creating-component") Creating a component
p
| A component receives a #[code Doc] object and
| #[strong performs the actual processing], for example using the current
| weights to make a prediction and set some annotation on the document. By
| adding a component to the pipeline, you'll get access to the #[code Doc]
| at any point #[strong during] processing instead of only being able to
| modify it afterwards.
+aside-code("Example").
def my_component(doc):
    # do something to the doc here
    return doc
+table(["Argument", "Type", "Description"])
+row
+cell #[code path]
+cell
| Where to load the data from. If None, the default data path is
| fetched via #[code spacy.util.get_data_path()]. You can
| configure this default using #[code spacy.util.set_data_path()].
| The data path is expected to be either a string, or an object
| responding to the #[code pathlib.Path] interface. If the path is
| a string, it will be immediately transformed into a
| #[code pathlib.Path] object. spaCy promises to never manipulate
| or open file-system paths as strings. All access to the
| file-system is done via the #[code pathlib.Path] interface.
| spaCy also promises to never check the type of path objects.
| This allows you to customize the loading behaviours in arbitrary
| ways, by creating your own object that implements the
| #[code pathlib.Path] interface.
+cell #[code doc]
+cell #[code Doc]
+cell The #[code Doc] object processed by the previous component.
+row
+cell #[code pipeline]
+cell
| A sequence of functions that take the Doc object and modify it
| in-place. See
| #[+a("customizing-pipeline") Customizing the pipeline].
+footrow
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object processed by this pipeline component.
+row
+cell #[code create_pipeline]
+cell
| Callback to construct the pipeline sequence. It should accept
| the #[code nlp] instance as its only argument, and return a
| sequence of functions that take the #[code Doc] object and
| modify it in-place.
| See #[+a("customizing-pipeline") Customizing the pipeline]. If
| a value is supplied to the pipeline keyword argument, the
| #[code create_pipeline] keyword argument is ignored.
p
| When creating a new #[code Language] class, you can pass it a list of
| pipeline component functions to execute in that order. You can also
| add it to an existing pipeline by modifying #[code nlp.pipeline]. Just
| be careful not to overwrite a pipeline or its components by accident!
+row
+cell #[code make_doc]
+cell A function that takes the input and returns a document object.
+code.
# Create a new Language object with a pipeline
from spacy.language import Language
nlp = Language(pipeline=[my_component])
+row
+cell #[code create_make_doc]
+cell
| Callback to construct the #[code make_doc] function. It should
| accept the #[code nlp] instance as its only argument. To use the
| built-in annotation processes, it should return an object of
| type #[code Doc]. If a value is supplied to the #[code make_doc]
| keyword argument, the #[code create_make_doc] keyword argument
| is ignored.
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
+h(3, "creating-factory") Creating a factory
p
| A factory is a #[strong function that returns a pipeline component].
| It's called with the #[code Vocab] object, to give it access to the
| shared data between components, for example the strings, morphology,
| vectors or annotation scheme. Factories are useful for creating
| #[strong stateful components], especially ones which
| #[strong depend on shared data].
+aside-code("Example").
def my_factory(vocab):
    # load some state
    def my_component(doc):
        # process the doc
        return doc
    return my_component
+table(["Argument", "Type", "Description"])
+row
+cell #[code vocab]
+cell Supply a pre-built Vocab instance, instead of constructing one.
+row
+cell #[code add_vectors]
+cell #[code Vocab]
+cell
| Callback that installs word vectors into the Vocab instance. The
| #[code add_vectors] callback should take a
| #[+api("vocab") #[code Vocab]] instance as its only argument,
| and set the word vectors and #[code vectors_length] in-place. See
| #[+a("word-vectors-similarities") Word Vectors and Similarities].
| Shared data between components, including strings, morphology,
| vectors etc.
+row
+cell #[code tagger]
+cell Supply a pre-built tagger, instead of creating one.
+footrow
+cell returns
+cell callable
+cell The pipeline component.
+row
+cell #[code parser]
+cell Supply a pre-built parser, instead of creating one.
p
| By creating a factory, you're essentially telling spaCy how to get the
| pipeline component #[strong once the vocab is available]. Factories need to
| be registered via #[+api("spacy#set_factory") #[code set_factory()]] and
| by assigning them a unique ID. This ID can be added to the pipeline as a
| string. When creating a pipeline, you're free to mix strings and
| callable components:
+row
+cell #[code entity]
+cell Supply a pre-built entity recognizer, instead of creating one.
+code.
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', my_other_component])
+row
+cell #[code matcher]
+cell Supply a pre-built matcher, instead of creating one.
p
| If spaCy comes across a string in the pipeline, it will try to resolve it
| by looking it up in the available factories. The factory will then be
| initialised with the #[code Vocab]. Providing factory names instead of
| callables also makes it easy to specify them in the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. If you're
| training your own model and want to use one of spaCy's default components,
| you won't have to worry about finding and implementing it either. To use
| the default tagger, simply add #[code "tagger"] to the pipeline, and
| #[strong spaCy will know what to do].
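p
| For example, the following sketch mixes a default component ID, a
| registered factory ID and a plain component function. It assumes
| #[code my_component] and #[code my_factory] are defined as in the
| examples above:
+code.
from spacy.language import Language
spacy.set_factory('my_factory', my_factory)  # register the factory under an ID
nlp = Language(pipeline=['tagger', 'my_factory', my_component])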
+infobox("Important note")
| Because factories are #[strong resolved on initialisation] of the
| #[code Language] class, it's #[strong not possible] to add them to the
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
| works with individual component functions. To use factories, you need to
| create a new #[code Language] object, or generate a
| #[+a("/docs/usage/saving-loading#models-generating") model package] with
| a custom pipeline.
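p
| In other words (a small sketch, reusing #[code my_component] and
| #[code my_factory] from above):
+code.
nlp.pipeline.append(my_component)        # works: plain component function
# nlp.pipeline.append('my_factory')      # won't work: the string is never resolved
nlp = Language(pipeline=['my_factory'])  # works: factory is resolved on initialisation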
+h(2, "example1") Example: Custom sentence segmentation logic
+aside("Real-world examples")
| To see real-world examples of pipeline factories and components in action,
| you can have a look at the source of spaCy's built-in components, e.g.
| the #[+src(gh("spacy")) tagger], #[+src(gh("spacy")) parser] or
| #[+src(gh("spacy")) entity recognizer].
p
| Let's say you want to implement custom logic to improve spaCy's sentence
| boundary detection. Currently, sentence segmentation is based on the
| dependency parse, which doesn't always produce ideal results. The custom
| logic should therefore be applied #[strong after] tokenization, but
| #[strong before] the dependency parsing. This way, the parser can also
| take advantage of the sentence boundaries.
+code.
def sbd_component(doc):
    for i, token in enumerate(doc[:-2]):
        # define sentence start if period + titlecase token
        if token.text == '.' and doc[i+1].is_title:
            doc[i+1].sent_start = True
    return doc
p
| In this case, we simply want to add the component to the existing
| pipeline of the English model. We can do this by inserting it at index 0
| of #[code nlp.pipeline]:
+code.
nlp = spacy.load('en')
nlp.pipeline.insert(0, sbd_component)
p
| When you call #[code nlp] on some text, spaCy will tokenize it to create
| a #[code Doc] object, and first call #[code sbd_component] on it, followed
| by the model's default pipeline.
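p
| As a quick check (a minimal sketch), the boundaries set by
| #[code sbd_component] should now show up when you iterate over
| #[code doc.sents]:
+code.
nlp = spacy.load('en')
nlp.pipeline.insert(0, sbd_component)
doc = nlp(u'This is a sentence. This is another one.')
print([sent.text for sent in doc.sents])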
+h(2, "example2") Example: Sentiment model
p
| Let's say you have trained your own document sentiment model on English
| text. After tokenization, you want spaCy to first execute the
| #[strong default vectorizer], followed by a custom
| #[strong sentiment component] that adds a #[code .sentiment]
| property to the #[code Doc], containing your model's sentiment prediction.
p
| Your component class will have a #[code from_disk()] method that spaCy
| calls to load the model data. When called, the component will compute
| the sentiment score, add it to the #[code Doc] and return the modified
| document. Optionally, the component can include an #[code update()] method
| to allow training the model.
+code.
import pickle
from pathlib import Path
class SentimentComponent(object):
    def __init__(self, vocab):
        self.weights = None
    def __call__(self, doc):
        doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
        return doc
    def from_disk(self, path): # path = model path + factory ID ('sentiment')
        with (Path(path) / 'weights.bin').open('rb') as file_:
            self.weights = pickle.load(file_) # load weights
        return self
    def update(self, doc, gold): # update weights allows training!
        prediction = sum(self.weights*doc.vector)
        self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
p
| The factory will initialise the component with the #[code Vocab] object.
| To be able to add it to your model's pipeline as #[code 'sentiment'],
| it also needs to be registered via
| #[+api("spacy#set_factory") #[code set_factory()]].
+code.
def sentiment_factory(vocab):
    component = SentimentComponent(vocab) # initialise component
    return component
spacy.set_factory('sentiment', sentiment_factory)
p
| The above code should be #[strong shipped with your model]. You can use
| the #[+api("cli#package") #[code package]] command to create all required
| files and directories. The model package will include an
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]
| with a #[code load()] method, that will initialise the language class with
| the model's pipeline and call the #[code from_disk()] method to load
| the model data.
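p
| Roughly, the generated #[code load()] method does something like the
| following (a simplified sketch, not the exact template code):
+code.
from spacy.util import get_lang_class
def load(path):
    cls = get_lang_class('en')                       # "lang" from the meta.json setup
    nlp = cls(pipeline=['vectorizer', 'sentiment'])  # "pipeline" from the meta.json setup
    return nlp.from_disk(path)                       # load the binary data, incl. weights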
p
| In the model package's meta.json, specify the language class and pipeline
| IDs in #[code setup]:
+code("meta.json (excerpt)", "json").
{
    "name": "my_sentiment_model",
    "version": "1.0.0",
    "spacy_version": "&gt;=2.0.0,&lt;3.0.0",
    "setup": {
        "lang": "en",
        "pipeline": ["vectorizer", "sentiment"]
    }
}
p
| When you load your new model, spaCy will call the model's #[code load()]
| method. This will return a #[code Language] object with a pipeline
| containing the default vectorizer, and the sentiment component returned
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('my_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading#models") saving and loading models].
+h(2, "disabling") Disabling pipeline components
p
| If you don't need a particular component of the pipeline, for
| example the tagger or the parser, you can disable loading it. This can
| sometimes make a big difference and improve loading speed. Disabled
| component names can be provided to #[code spacy.load], #[code from_disk]
| or the #[code nlp] object itself as a list:
+code.
nlp = spacy.load('en', disable=['parser', 'tagger'])
nlp = English().from_disk('/model', disable=['vectorizer', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
p
| Note that you can't write directly to #[code nlp.pipeline], as this list
| holds the #[em actual components], not the IDs. However, if you know the
| order of the components, you can still slice the list:
+code.
nlp = spacy.load('en')
nlp.pipeline = nlp.pipeline[:2] # only use the first two components
+infobox("Important note: disabling pipeline components")
.o-block
| Since spaCy v2.0 comes with better support for customising the
| processing pipeline components, the #[code parser], #[code tagger]
| and #[code entity] keyword arguments have been replaced with
| #[code disable], which takes a list of pipeline component names.
| This lets you disable both default and custom components when loading
| a model, or initialising a Language class via
| #[+api("language-from_disk") #[code from_disk]].
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)

View File

@ -6,23 +6,156 @@ p
| The following examples and code snippets give you an overview of spaCy's
| functionality and its usage.
+h(2, "models") Install and load models
+h(2, "models") Install models and process text
+code(false, "bash").
python -m spacy download en
python -m spacy download de
+code.
import spacy
nlp = spacy.load('en')
doc = nlp(u'Hello, world. Here are two sentences.')
+h(2, "examples-resources") Load resources and process text
nlp_de = spacy.load('de')
doc_de = nlp_de(u'Ich bin ein Berliner.')
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
| #[strong Usage:] #[+a("/docs/usage/models") Models],
| #[+a("/docs/usage/spacy-101") spaCy 101]
+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
+tag-model("dependency parse")
+code.
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
u"emoji. It's outranking eggplant 🍑 ")
assert doc[0].text == u'Peach'
assert doc[1].text == u'emoji'
assert doc[-1].text == u'🍑'
assert doc[17:19] == u'outranking eggplant'
assert doc.noun_chunks[0].text == u'Peach emoji'
sentences = list(doc.sents)
assert len(sentences) == 3
assert sentences[1].text == u'Peach is the superior emoji.'
+infobox
| #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
+h(2, "examples-pos-tags") Get part-of-speech tags and flags
+tag-model("tagger")
+code.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
assert [apple.pos_, apple.pos] == [u'PROPN', 94]
assert [apple.tag_, apple.tag] == [u'NNP', 475]
assert [apple.shape_, apple.shape] == [u'Xxxxx', 684]
assert apple.is_alpha == True
assert apple.is_punct == False
billion = doc[10]
assert billion.is_digit == False
assert billion.like_num == True
assert billion.like_email == False
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
+h(2, "examples-integer-ids") Use integer IDs for any string
+code.
doc = nlp(u'Hello, world!')
token = doc[0]
hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id]
assert token.orth == hello_id == 3125
assert token.text == hello_str == 'Hello'
+h(2, "examples-entities") Recongnise and update named entities
+tag-model("NER")
+code.
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(u'San Francisco', 0, 13, u'GPE')]
from spacy.tokens import Span
doc = nlp(u'Netflix is hiring a new VP of global policy')
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(0, 7, u'ORG')]
+infobox
| #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]
+h(2, "displacy") Visualize a dependency parse and named entities in your browser
+tag-model("dependency parse", "NER")
+code.
from spacy import displacy
doc_dep = nlp(u'This is a sentence.')
displacy.serve(doc_dep, style='dep')
doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
u'in 2007, few people outside of the company took him seriously.')
displacy.serve(doc_ent, style='ent')
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]
+h(2, "examples-word-vectors") Get word vectors and similarity
+tag-model("word vectors")
+code.
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]
assert apple.similarity(banana) > pasta.similarity(hippo)
assert apple.has_vector and banana.has_vector and pasta.has_vector and hippo.has_vector
+infobox
| #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]
+h(2, "examples-serialization") Simple and efficient serialization
+code.
import spacy
en_nlp = spacy.load('en')
de_nlp = spacy.load('de')
en_doc = en_nlp(u'Hello, world. Here are two sentences.')
de_doc = de_nlp(u'ich bin ein Berliner.')
from spacy.tokens.doc import Doc
nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')
new_doc = Doc(nlp.vocab).from_disk('/moby_dick.bin')
+infobox
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(2, "rule-matcher") Match text with token rules
+code.
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# match "Google I/O" or "Google i/o"
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
matcher.add('GoogleIO', None, pattern)
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)
+infobox
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+h(2, "multi-threaded") Multi-threaded generator
@ -35,37 +168,25 @@ p
if i == 100:
    break
+h(2, "examples-tokens-sentences") Get tokens and sentences
+infobox
| #[strong API:] #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]
+h(2, "examples-dependencies") Get syntactic dependencies
+tag-model("dependency parse")
+code.
token = doc[0]
sentence = next(doc.sents)
assert token is sentence[0]
assert sentence.text == 'Hello, world.'
def dependency_labels_to_root(token):
"""Walk up the syntactic tree, collecting the arc labels."""
dep_labels = []
while token.head is not token:
dep_labels.append(token.dep)
token = token.head
return dep_labels
+h(2, "examples-integer-ids") Use integer IDs for any string
+code.
hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id]
assert token.orth == hello_id == 3125
assert token.orth_ == hello_str == 'Hello'
+h(2, "examples-string-views-flags") Get and set string views and flags
+code.
assert token.shape_ == 'Xxxxx'
for lexeme in nlp.vocab:
    if lexeme.is_alpha:
        lexeme.shape_ = 'W'
    elif lexeme.is_digit:
        lexeme.shape_ = 'D'
    elif lexeme.is_punct:
        lexeme.shape_ = 'P'
    else:
        lexeme.shape_ = 'M'
assert token.shape_ == 'W'
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]
+h(2, "examples-numpy-arrays") Export to numpy arrays
@ -80,107 +201,25 @@ p
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
+h(2, "examples-word-vectors") Word vectors
+code.
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
apples = doc[0]
oranges = doc[2]
boots = doc[6]
hippos = doc[8]
assert apples.similarity(oranges) > boots.similarity(hippos)
+h(2, "examples-pos-tags") Part-of-speech tags
+code.
from spacy.parts_of_speech import ADV
def is_adverb(token):
    return token.pos == ADV
# These are data-specific, so no constants are provided. You have to look
# up the IDs from the StringStore.
NNS = nlp.vocab.strings['NNS']
NNPS = nlp.vocab.strings['NNPS']
def is_plural_noun(token):
    return token.tag == NNS or token.tag == NNPS
def print_coarse_pos(token):
    print(token.pos_)
def print_fine_pos(token):
    print(token.tag_)
+h(2, "examples-dependencies") Syntactic dependencies
+code.
def dependency_labels_to_root(token):
    '''Walk up the syntactic tree, collecting the arc labels.'''
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels
+h(2, "examples-entities") Named entities
+code.
from collections import defaultdict
from spacy.parts_of_speech import VERB
def iter_products(docs):
    for doc in docs:
        for ent in doc.ents:
            if ent.label_ == 'PRODUCT':
                yield ent
def word_is_in_entity(word):
    return word.ent_type != 0
def count_parent_verb_by_person(docs):
    counts = defaultdict(lambda: defaultdict(int))
    for doc in docs:
        for ent in doc.ents:
            if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
                counts[ent.orth_][ent.root.head.lemma_] += 1
    return counts
+h(2, "examples-inline") Calculate inline mark-up on original string
+h(2, "examples-inline") Calculate inline markup on original string
+code.
def put_spans_around_tokens(doc, get_classes):
'''Given some function to compute class names, put each token in a
span element, with the appropriate classes computed.
All whitespace is preserved, outside of the spans. (Yes, I know HTML
won't display it. But the point is no information is lost, so you can
calculate what you need, e.g. <br /> tags, <p> tags, etc.)
'''
"""Given some function to compute class names, put each token in a
span element, with the appropriate classes computed. All whitespace is
preserved, outside of the spans. (Of course, HTML won't display more than
one whitespace character, but the point is, no information is lost
and you can calculate what you need, e.g. &lt;br /&gt;, &lt;p&gt; etc.)
"""
output = []
template = '<span classes="{classes}">{word}</span>{space}'
html = '&lt;span class="{classes}"&gt;{word}&lt;/span&gt;{space}'
for token in doc:
if token.is_space:
output.append(token.orth_)
output.append(token.text)
else:
output.append(
template.format(
classes=' '.join(get_classes(token)),
word=token.orth_,
space=token.whitespace_))
classes = ' '.join(get_classes(token))
output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
string = ''.join(output)
string = string.replace('\n', '')
string = string.replace('\t', ' ')
return string
+h(2, "examples-binary") Efficient binary serialization
+code.
import spacy
from spacy.tokens.doc import Doc
byte_string = doc.to_bytes()
open('moby_dick.bin', 'wb').write(byte_string)
nlp = spacy.load('en')
for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
    doc = Doc(nlp.vocab)
    doc.from_bytes(byte_string)

View File

@ -195,7 +195,7 @@ p
| privileges, the #[code spacy link] command may fail. The easiest solution
| is to re-run the command as admin, or use a #[code virtualenv]. For more
| info on this, see the
| #[+a("/docs/usage/troubleshooting#symlink-privilege") troubleshooting guide].
| #[+a("/docs/usage/#symlink-privilege") troubleshooting guide].
+h(3, "usage-import") Importing models as modules
@ -233,4 +233,4 @@ p
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading") saving and loading models].
| #[+a("/docs/usage/saving-loading#models") saving and loading models].

View File

@ -7,22 +7,12 @@ p
| assigned to each token in the document. They're useful in rule-based
| processes. They can also be useful features in some statistical models.
p
| To use spaCy's tagger, you need to have a data pack installed that
| includes a tagging model. Tagging models are included in the data
| downloads for English and German. After you load the model, the tagger
| is applied automatically, as part of the default pipeline. You can then
| access the tags using the #[+api("token") #[code Token.tag]] and
| #[+api("token") #[code token.pos]] attributes. For English, the tagger
| also triggers some simple rule-based morphological processing, which
| gives you the lemma as well.
+h(2, "101") Part-of-speech tagging 101
+tag-model("tagger", "dependency parse")
+code("Usage").
import spacy
nlp = spacy.load('en')
doc = nlp(u'They told us to duck.')
for word in doc:
    print(word.text, word.lemma, word.lemma_, word.tag, word.tag_, word.pos, word.pos_)
include _spacy-101/_pos-deps
+aside("Help spaCy's output is wrong!")
+h(2, "rule-based-morphology") Rule-based morphology
@ -63,7 +53,8 @@ p
+list("numbers")
+item
| The tokenizer consults a #[strong mapping table]
| The tokenizer consults a
| #[+a("/docs/usage/adding-languages#tokenizer-exceptions") mapping table]
| #[code TOKENIZER_EXCEPTIONS], which allows sequences of characters
| to be mapped to multiple tokens. Each token may be assigned a part
| of speech and one or more morphological features.
@ -77,8 +68,9 @@ p
+item
| For words whose POS is not set by a prior process, a
| #[strong mapping table] #[code TAG_MAP] maps the tags to a
| part-of-speech and a set of morphological features.
| #[+a("/docs/usage/adding-languages#tag-map") mapping table]
| #[code TAG_MAP] maps the tags to a part-of-speech and a set of
| morphological features.
+item
| Finally, a #[strong rule-based deterministic lemmatizer] maps the

View File

@ -2,74 +2,12 @@
include ../../_includes/_mixins
p
| Once you have loaded the #[code nlp] object, you can call it as though
| it were a function. This allows you to process a single unicode string.
+code.
doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...')
p
| The library should perform equally well with short or long documents.
| All algorithms are linear-time in the length of the string, and once the
| data is loaded, there's no significant start-up cost to consider. This
| means that you don't have to strategically merge or split your text —
| you should feel free to feed in either single tweets or whole novels.
p
| If you run #[code nlp = spacy.load('en')], the #[code nlp] object will
| be an instance of #[code spacy.en.English]. This means that when you run
| #[code doc = nlp(text)], you're executing
| #[code spacy.en.English.__call__], which is implemented on its parent
| class, #[+api("language") #[code Language]].
+code.
doc = nlp.make_doc(text)
for proc in nlp.pipeline:
    proc(doc)
p
| I've tried to make sure that the #[code Language.__call__] function
| doesn't do any "heavy lifting", so that you won't have complicated logic
| to replicate if you need to make your own pipeline class. This is all it
| does.
p
| The #[code .make_doc()] method and #[code .pipeline] attribute make it
| easier to customise spaCy's behaviour. If you're using the default
| pipeline, we can desugar one more time.
+code.
doc = nlp.tokenizer(text)
nlp.tagger(doc)
nlp.parser(doc)
nlp.entity(doc)
p Finally, here's where you can find out about each of those components:
+table(["Name", "Source"])
+row
+cell #[code tokenizer]
+cell #[+src(gh("spacy", "spacy/tokenizer.pyx")) spacy.tokenizer.Tokenizer]
+row
+cell #[code tagger]
+cell #[+src(gh("spacy", "spacy/tagger.pyx")) spacy.pipeline.Tagger]
+row
+cell #[code parser]
+cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.DependencyParser]
+row
+cell #[code entity]
+cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.EntityRecognizer]
+h(2, "multithreading") Multi-threading with #[code .pipe()]
p
| If you have a sequence of documents to process, you should use the
| #[+api("language#pipe") #[code .pipe()]] method. The #[code .pipe()]
| method takes an iterator of texts, and accumulates an internal buffer,
| #[+api("language#pipe") #[code .pipe()]] method. The method takes an
| iterator of texts, and accumulates an internal buffer,
| which it works on in parallel. It then yields the documents in order,
| one-by-one. After a long and bitter struggle, the global interpreter
| lock was freed around spaCy's main parsing loop in v0.100.3. This means

View File

@ -1,118 +0,0 @@
//- 💫 DOCS > USAGE > RESOURCES
include ../../_includes/_mixins
p Many of the associated tools and resources that we're developing alongside spaCy can be found in their own repositories.
+h(2, "developer") Developer tools
+table(["Name", "Description"])
+row
+cell
+src(gh("spacy-models")) spaCy Models
+cell
| Model releases for spaCy.
+row
+cell
+src(gh("spacy-dev-resources")) spaCy Dev Resources
+cell
| Scripts, tools and resources for developing spaCy, adding new
| languages and training new models.
+row
+cell
+src("spacy-benchmarks") spaCy Benchmarks
+cell
| Runtime performance comparison of spaCy against other NLP
| libraries.
+row
+cell
+src(gh("spacy-services")) spaCy Services
+cell
| REST microservices for spaCy demos and visualisers.
+row
+cell
+src(gh("spacy-notebooks")) spaCy Notebooks
+cell
| Jupyter notebooks for spaCy examples and tutorials.
+h(2, "libraries") Libraries and projects
+table(["Name", "Description"])
+row
+cell
+src(gh("sense2vec")) sense2vec
+cell
| Use spaCy to go beyond vanilla
| #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec].
+h(2, "utility") Utility libraries and dependencies
+table(["Name", "Description"])
+row
+cell
+src(gh("thinc")) Thinc
+cell
| spaCy's Machine Learning library for NLP in Python.
+row
+cell
+src(gh("cymem")) Cymem
+cell
| Gate Cython calls to malloc/free behind Python ref-counted
| objects.
+row
+cell
+src(gh("preshed")) Preshed
+cell
| Cython hash tables that assume keys are pre-hashed.
+row
+cell
+src(gh("murmurhash")) MurmurHash
+cell
| Cython bindings for
| #[+a("https://en.wikipedia.org/wiki/MurmurHash") MurmurHash2].
+h(2, "visualizers") Visualisers and demos
+table(["Name", "Description"])
+row
+cell
+src(gh("displacy")) displaCy.js
+cell
| A lightweight dependency visualisation library for the modern
| web, built with JavaScript, CSS and SVG.
| #[+a(DEMOS_URL + "/displacy") Demo here].
+row
+cell
+src(gh("displacy-ent")) displaCy#[sup ENT]
+cell
| A lightweight and modern named entity visualisation library
| built with JavaScript and CSS.
| #[+a(DEMOS_URL + "/displacy-ent") Demo here].
+row
+cell
+src(gh("sense2vec-demo")) sense2vec Demo
+cell
| Source of our Semantic Analysis of the Reddit Hivemind
| #[+a(DEMOS_URL + "/sense2vec") demo] using
| #[+a(gh("sense2vec")) sense2vec].

View File

@ -1,45 +1,89 @@
include ../../_includes/_mixins
+h(2, "101") Serialization 101
include _spacy-101/_serialization
+infobox("Important note")
| In spaCy v2.0, the API for saving and loading has changed to only use the
| four methods listed above consistently across objects and classes. For an
| overview of the changes, see #[+a("/docs/usage/v2#incompat") this table]
| and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating].
+h(3, "example-doc") Example: Saving and loading a document
p
| For simplicity, let's assume you've
| #[+a("/docs/usage/entity-recognition#setting") added custom entities] to
| a #[code Doc], either manually, or by using a
| #[+a("/docs/usage/rule-based-matching#on_match") match pattern]. You can
| save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]],
| and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]].
| This will overwrite the existing object and return it.
+code.
import spacy
from spacy.tokens import Span
text = u'Netflix is hiring a new VP of global policy'
nlp = spacy.load('en')
doc = nlp(text)
assert len(doc.ents) == 0 # Doc has no entities
doc.ents += (Span(doc, 0, 1, label=doc.vocab.strings[u'ORG']),) # add entity
doc.to_disk('/path/to/doc') # save Doc to disk
new_doc = nlp(text)
assert len(new_doc.ents) == 0 # new Doc has no entities
new_doc = new_doc.from_disk('/path/to/doc') # load from disk and overwrite
assert len(new_doc.ents) == 1 # entity is now recognised!
assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')]
+h(2, "models") Saving models
p
| After training your model, you'll usually want to save its state, and load
| it back later. You can do this with the
| #[+api("language#save_to_directory") #[code Language.save_to_directory()]]
| #[+api("language#to_disk") #[code Language.to_disk()]]
| method:
+code.
nlp.save_to_directory('/home/me/data/en_example_model')
nlp.to_disk('/home/me/data/en_example_model')
p
| The directory will be created if it doesn't exist, and the whole pipeline
| will be written out. To make the model more convenient to deploy, we
| recommend wrapping it as a Python package.
+h(2, "generating") Generating a model package
+h(3, "models-generating") Generating a model package
+infobox("Important note")
| The model packages are #[strong not suitable] for the public
| #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
| designed for binary data and files over 50 MB. However, if your company
| is running an internal installation of pypi, publishing your models on
| there can be a convenient solution to share them with your team.
| is running an #[strong internal installation] of PyPI, publishing your
| models on there can be a convenient way to share them with your team.
p
| spaCy comes with a handy CLI command that will create all required files,
| and walk you through generating the meta data. You can also create the
| meta.json manually and place it in the model data directory, or supply a
| path to it using the #[code --meta] flag. For more info on this, see the
| #[+a("/docs/usage/cli/#package") #[code package] command] documentation.
| path to it using the #[code --meta] flag. For more info on this, see
| the #[+api("cli#package") #[code package]] docs.
+aside-code("meta.json", "json").
{
    "name": "example_model",
    "lang": "en",
    "version": "1.0.0",
    "spacy_version": "&gt;=1.7.0,&lt;2.0.0",
    "spacy_version": "&gt;=2.0.0,&lt;3.0.0",
    "description": "Example model for spaCy",
    "author": "You",
    "email": "you@example.com",
    "license": "CC BY-SA 3.0"
    "license": "CC BY-SA 3.0",
    "setup": {
        "lang": "en",
        "pipeline": ["token_vectors", "tagger"]
    }
}
+code(false, "bash").
@ -58,52 +102,112 @@ p This command will create a model package directory that should look like this:
p
| You can also find templates for all files in our
| #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
| #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources].
| If you're creating the package manually, keep in mind that the directories
| need to be named according to the naming conventions of
| #[code [language]_[name]] and #[code [language]_[name]-[version]]. The
| #[code lang] setting in the meta.json is also used to create the
| respective #[code Language] class in spaCy, which will later be returned
| by the model's #[code load()] method.
| #[code lang_name] and #[code lang_name-version].
+h(2, "building") Building a model package
+h(3, "models-custom") Customising the model setup
p
| The meta.json includes a #[code setup] key that lets you customise how
| the model should be initialised and loaded. You can define the language
| data to be loaded and the
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
| execute.
+table(["Setting", "Type", "Description"])
+row
+cell #[code lang]
+cell unicode
+cell ID of the language class to initialise.
+row
+cell #[code pipeline]
+cell list
+cell
| A list of strings mapping to the IDs of pipeline factories to
| apply in that order. If not set, spaCy's
| #[+a("/docs/usage/language-processing/pipelines") default pipeline]
| will be used.
p
| The #[code load()] method that comes with our model package
| templates will take care of putting all this together and returning a
| #[code Language] object with the loaded pipeline and data. If your model
| requires custom pipeline components, you should
| #[strong ship them with your model] and register their
| #[+a("/docs/usage/language-processing-pipeline#creating-factory") factories]
| via #[+api("spacy#set_factory") #[code set_factory()]].
+aside-code("Factory example").
def my_factory(vocab):
    # load some state
    def my_component(doc):
        # process the doc
        return doc
    return my_component
+code.
spacy.set_factory('custom_component', custom_component_factory)
+infobox("Custom models with pipeline components")
| For more details and an example of how to package a sentiment model
| with a custom pipeline component, see the usage workflow on
| #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines].
+h(3, "models-building") Building the model package
p
| To build the package, run the following command from within the
| directory. This will create a #[code .tar.gz] archive in a directory
| #[code /dist].
| directory. For more information on building Python packages, see the
| docs on Python's
| #[+a("https://setuptools.readthedocs.io/en/latest/") Setuptools].
+code(false, "bash").
python setup.py sdist
p
| For more information on building Python packages, see the
| #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation].
+h(2, "loading") Loading a model package
p
| Model packages can be installed by pointing pip to the model's
| #[code .tar.gz] archive:
| This will create a #[code .tar.gz] archive in a directory #[code /dist].
| The model can be installed by pointing pip to the path of the archive:
+code(false, "bash").
pip install /path/to/en_example_model-1.0.0.tar.gz
p You'll then be able to load the model as follows:
p
| You can then load the model via its name, #[code en_example_model], or
| import it directly as a module and then call its #[code load()] method.
+code.
import en_example_model
nlp = en_example_model.load()
+h(2, "loading") Loading a custom model package
p
| To load the model via #[code spacy.load()], you can also
| create a #[+a("/docs/usage/models#usage") shortcut link] that maps the
| package name to a custom model name of your choice:
+code(false, "bash").
python -m spacy link en_example_model example
| To load a model from a data directory, you can use
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
| look for a meta.json in the directory and use the #[code setup] details
| to initialise a #[code Language] class with a processing pipeline and
| load in the model data.
+code.
import spacy
nlp = spacy.load('example')
nlp = spacy.load('/path/to/model')
p
| If you want to #[strong load only the binary data], you'll have to create
| a #[code Language] class and call
| #[+api("language#from_disk") #[code from_disk]] instead.
+code.
from spacy.lang.en import English
nlp = English().from_disk('/path/to/data')
+infobox("Important note: Loading data in v2.x")
.o-block
| In spaCy 1.x, the distinction between #[code spacy.load()] and the
| #[code Language] class constructor was quite unclear. You could call
| #[code spacy.load()] when no model was present, and it would silently
| return an empty object. Likewise, you could pass a path to
| #[code English], even if the model required a different language.
| spaCy v2.0 solves this with a clear distinction between setting up
| the instance and loading the data.
+code-new nlp = English().from_disk('/path/to/data')
+code-old nlp = spacy.load('en', path='/path/to/data')

View File

@ -2,9 +2,210 @@
include ../../_includes/_mixins
+h(2, "features") Features
+aside
| If one of spaCy's functionalities #[strong needs a model], it means that
| you need to have one of the available
| #[+a("/docs/usage/models") statistical models] installed. Models are used
| to #[strong predict] linguistic annotations, for example whether a word
| is a verb or a noun.
+table(["Name", "Description", "Needs model"])
+row
+cell #[strong Tokenization]
+cell
+cell #[+procon("con")]
+row
+cell #[strong Part-of-speech Tagging]
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Dependency Parsing]
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Sentence Boundary Detection]
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Named Entity Recognition] (NER)
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Rule-based Matching]
+cell
+cell #[+procon("con")]
+row
+cell #[strong Similarity]
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Training]
+cell
+cell #[+procon("neutral")]
+row
+cell #[strong Serialization]
+cell
+cell #[+procon("neutral")]
+h(2, "annotations") Linguistic annotations
p
| spaCy provides a variety of linguistic annotations to give you insights
| into a text's grammatical structure. This includes the word types,
| i.e. the parts of speech, and how the words are related to each other.
| For example, if you're analysing text, it makes a #[em huge] difference
| whether a noun is the subject of a sentence, or the object, or whether
| "google" is used as a verb, or refers to the website or company in a
| specific context.
p
| Once you've downloaded and installed a #[+a("/docs/usage/models") model],
| you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will
| return a #[code Language] object containing all components and data needed
| to process text. We usually call it #[code nlp]. Calling the #[code nlp]
| object on a string of text will return a processed #[code Doc]:
+code.
import spacy
nlp = spacy.load('en')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
p
| Even though a #[code Doc] is processed, e.g. split into individual words
| and annotated, it still holds #[strong all information of the original text],
| like whitespace characters. This way, you'll never lose any information
| when processing text with spaCy.
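p
| For example, the original text, including its whitespace, can always be
| reconstructed from the #[code Doc] (a small sketch):
+code.
doc = nlp(u'Apple is   looking at buying U.K. startup')
assert doc.text == u'Apple is   looking at buying U.K. startup'
assert doc.text == ''.join(token.text_with_ws for token in doc)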
+h(3, "annotations-token") Tokenization
include _spacy-101/_tokenization
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
+tag-model("dependency parse")
include _spacy-101/_pos-deps
+h(3, "annotations-ner") Named Entities
+tag-model("named entities")
include _spacy-101/_named-entities
+h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors")
include _spacy-101/_similarity
include _spacy-101/_word-vectors
+h(2, "pipelines") Pipelines
include _spacy-101/_pipelines
+h(2, "serialization") Serialization
include _spacy-101/_serialization
+h(2, "training") Training
include _spacy-101/_training
+h(2, "architecture") Architecture
+image
include ../../assets/img/docs/architecture.svg
.u-text-right
+button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic
+table(["Name", "Description"])
+row
+cell #[+api("language") #[code Language]]
+cell
| A text-processing pipeline. Usually you'll load this once per
| process as #[code nlp] and pass the instance around your application.
+row
+cell #[+api("doc") #[code Doc]]
+cell A container for accessing linguistic annotations.
+row
+cell #[+api("span") #[code Span]]
+cell A slice from a #[code Doc] object.
+row
+cell #[+api("token") #[code Token]]
+cell
| An individual token — i.e. a word, punctuation symbol, whitespace,
| etc.
+row
+cell #[+api("lexeme") #[code Lexeme]]
+cell
| An entry in the vocabulary. It's a word type with no context, as
| opposed to a word token. It therefore has no part-of-speech tag,
| dependency parse etc.
+row
+cell #[+api("vocab") #[code Vocab]]
+cell
| A lookup table for the vocabulary that allows you to access
| #[code Lexeme] objects.
+row
+cell #[code Morphology]
+cell
+row
+cell #[+api("stringstore") #[code StringStore]]
+cell Map strings to and from integer IDs.
+row
+cell #[+api("tokenizer") #[code Tokenizer]]
+cell
| Segment text, and create #[code Doc] objects with the discovered
| segment boundaries.
+row
+cell #[+api("tagger") #[code Tagger]]
+cell Annotate part-of-speech tags on #[code Doc] objects.
+row
+cell #[+api("dependencyparser") #[code DependencyParser]]
+cell Annotate syntactic dependencies on #[code Doc] objects.
+row
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
+cell
| Annotate named entities, e.g. persons or products, on #[code Doc]
| objects.
+row
+cell #[+api("matcher") #[code Matcher]]
+cell
| Match sequences of tokens, based on pattern rules, similar to
| regular expressions.
+h(3, "architecture-other") Other
+table(["Name", "Description"])
+row
+cell #[+api("goldparse") #[code GoldParse]]
+cell Collection for training annotations.
+row
+cell #[+api("goldcorpus") #[code GoldCorpus]]
+cell
| An annotated corpus, using the JSON file format. Manages
| annotations for tagging, dependency parsing and NER.

View File

@ -77,8 +77,8 @@ p
p
| To make the model more convenient to deploy, we recommend wrapping it as
| a Python package, so that you can install it via pip and load it as a
| module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command]
| to create all required files and directories.
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
| CLI command to create all required files and directories.
+code(false, "bash").
python -m spacy package /home/me/data/en_technology /home/me/my_models

View File

@ -6,6 +6,10 @@ p
| Once the model is trained, you can then
| #[+a("/docs/usage/saving-loading") save and load] it.
+h(2, "101") Training 101
include _spacy-101/_training
+h(2, "train-pos-tagger") Training the part-of-speech tagger
+code.

View File

@ -1,190 +0,0 @@
//- 💫 DOCS > USAGE > TROUBLESHOOTING
include ../../_includes/_mixins
p
| This section collects some of the most common errors you may come
| across when installing, loading and using spaCy, as well as their solutions.
+aside("Help us improve this guide")
| Did you come across a problem like the ones listed here and want to
| share the solution? You can find the "Suggest edits" button at the
| bottom of this page that points you to the source. We always
| appreciate #[+a(gh("spaCy") + "/pulls") pull requests]!
+h(2, "install-loading") Installation and loading
+h(3, "compatible-model") No compatible model found
+code(false, "text").
No compatible model found for [lang] (spaCy v#{SPACY_VERSION}).
p
| This usually means that the model you're trying to download does not
| exist, or isn't available for your version of spaCy.
+infobox("Solutions")
| Check the #[+a(gh("spacy-models", "compatibility.json")) compatibility table]
| to see which models are available for your spaCy version. If you're using
| an old version, consider upgrading to the latest release. Note that while
| spaCy supports tokenization for
| #[+a("/docs/api/language-models/#alpha-support") a variety of languages],
| not all of them come with statistical models. To only use the tokenizer,
| import the language's #[code Language] class instead, for example
| #[code from spacy.fr import French].
+h(3, "symlink-privilege") Symbolic link privilege not held
+code(false, "text").
OSError: symbolic link privilege not held
p
| To create #[+a("/docs/usage/models/#usage") shortcut links] that let you
| load models by name, spaCy creates a symbolic link in the
| #[code spacy/data] directory. This means your user needs permission to do
| this. The above error mostly occurs when doing a system-wide installation,
| which will create the symlinks in a system directory.
+infobox("Solutions")
| Run the #[code download] or #[code link] command as administrator,
| or use a #[code virtualenv] to install spaCy in a user directory, instead
| of doing a system-wide installation.
+h(3, "no-cache-dir") No such option: --no-cache-dir
+code(false, "text").
no such option: --no-cache-dir
p
| The #[code download] command uses pip to install the models and sets the
| #[code --no-cache-dir] flag to prevent it from requiring too much memory.
| #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
| requires pip v6.0 or newer.
+infobox("Solution")
| Run #[code pip install -U pip] to upgrade to the latest version of pip.
| To see which version you have installed, run #[code pip --version].
+h(3, "import-error") Import error
+code(false, "text").
Import Error: No module named spacy
p
| This error means that the spaCy module can't be located on your system, or in
| your environment.
+infobox("Solutions")
| Make sure you have spaCy installed. If you're using a #[code virtualenv],
| make sure it's activated and check that spaCy is installed in that
| environment; otherwise, you're trying to load a system installation. You
| can also run #[code which python] to find out where your Python
| executable is located.
+h(3, "import-error-models") Import error: models
+code(false, "text").
ImportError: No module named 'en_core_web_sm'
p
| As of spaCy v1.7, all models can be installed as Python packages. This means
| that they'll become importable modules of your application. When creating
| #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try
| to import the model to load its meta data. If this fails, it's usually a
| sign that the package is not installed in the current environment.
+infobox("Solutions")
| Run #[code pip list] or #[code pip freeze] to check which model packages
| you have installed, and install the
| #[+a("/docs/usage/models#available") correct models] if necessary. If you're
| importing a model manually at the top of a file, make sure to use the name
| of the package, not the shortcut link you've created.
+h(3, "vocab-strings") File not found: vocab/strings.json
+code(false, "text").
FileNotFoundError: No such file or directory: [...]/vocab/strings.json
p
| This error may occur when using #[code spacy.load()] to load
| a language model, either because you haven't set up a
| #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it
| doesn't actually exist.
+infobox("Solutions")
| Set up a #[+a("/docs/usage/models/#usage") shortcut link] for the model
| you want to load. This can either be an installed model package, or a
| local directory containing the model data. If you want to use one of the
| #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for
| languages that don't yet have a statistical model, you should import its
| #[code Language] class instead, for example
| #[code from spacy.fr import French].
+h(3, "command-not-found") Command not found
+code(false, "text").
command not found: spacy
p
| This error may occur when running the #[code spacy] command from the
| command line. spaCy does not currently add an entry to your #[code PATH]
| environment variable, as this can lead to unexpected results, especially
| when using #[code virtualenv]. Instead, commands need to be prefixed with
| #[code python -m].
+infobox("Solution")
| Run the command with #[code python -m], for example
| #[code python -m spacy download en]. For more info on this, see the
| #[+a("/docs/usage/cli") CLI documentation].
+h(3, "module-load") 'module' object has no attribute 'load'
+code(false, "text").
AttributeError: 'module' object has no attribute 'load'
p
| While this could technically have many causes, including spaCy being
| broken, the most likely one is that your script's file or directory name
| is "shadowing" the module e.g. your file is called #[code spacy.py],
| or a directory you're importing from is called #[code spacy].
+infobox("Solution")
| When using spaCy, never call anything else #[code spacy].
+h(2, "usage") Using spaCy
+h(3, "pos-lemma-number") POS tag or lemma is returned as number
+code.
doc = nlp(u'This is text.')
print([word.pos for word in doc])
# [88, 98, 90, 95]
p
| Like many NLP libraries, spaCy encodes all strings to integers. This
| reduces memory usage and improves efficiency. The integer mapping also
| makes it easy to interoperate with numpy. To access the string
| representation instead of the integer ID, add an underscore #[code _]
| after the attribute.
+infobox("Solutions")
| Use #[code pos_] or #[code lemma_] instead. See the
| #[+api("token#attributes") #[code Token] attributes] for a list of available
| attributes and their string representations.
+h(3, "pron-lemma") Pronoun lemma is returned as #[code -PRON-]
+code.
doc = nlp(u'They are')
print(doc[0].lemma_)
# -PRON-
p
| This is in fact expected behaviour and not a bug.
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns. For more info on this, see the
| #[+api("annotation#lemmatization") annotation specs] on lemmatization.

View File

@ -8,6 +8,65 @@ p
+h(2, "features") New features
+h(3, "features-pipelines") Improved processing pipelines
+aside-code("Example").
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
# Register a factory to create a component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', my_component])
p
| It's now much easier to customise the pipeline with your own components.
| Components are functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you'll want to create a new one
| for each pipeline. You can do that by defining and registering a factory
| which receives the shared #[code Vocab] object and returns a component.
p
| spaCy's default components, i.e. the vectorizer, tagger, parser and entity
| recognizer, can be added to your pipeline by using their string IDs.
| This way, you won't have to worry about finding and implementing them.
| To use the default tagger, simply add #[code "tagger"] to the pipeline,
| and spaCy will know what to do.
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
+h(3, "features-serializer") Saving, loading and serialization
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
nlp.to_disk('/path/to/nlp')
nlp = English().from_disk('/path/to/nlp')
p
| spaCy's serialization API has been made consistent across classes and
| objects. All container classes and pipeline components now have
| #[code to_bytes()], #[code from_bytes()], #[code to_disk()] and
| #[code from_disk()] methods that support the Pickle protocol.
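p
| For example, a byte-based round trip might look roughly like this (a
| sketch of the API described above, assuming an #[code nlp] object is
| already loaded):
+code.
from spacy.tokens import Doc
doc = nlp(u'This is a sentence.')
doc_bytes = doc.to_bytes()
# recreate the Doc from the raw bytes, using the shared vocab
new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)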
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package] or a path.
| The #[code Language] class to initialise will be determined based on the
| model's settings. For a blank language, you can import the class directly,
| e.g. #[code from spacy.lang.en import English].
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-displacy") displaCy visualizer with Jupyter support
+aside-code("Example").
@ -28,65 +87,23 @@ p
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
+h(3, "features-loading") Loading
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+h(3, "features-language") Improved language data and lazy loading
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code path] keyword argument is now deprecated.
p
| The #[code Language] class to initialise will be determined based on the
| model's settings. If no model is found, spaCy will let you know and won't
| just return an empty #[code Language] object anymore. If you want a blank
| language, you can always import the class directly, e.g.
| #[code from spacy.lang.en import English].
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-language") Improved language data and processing pipelines
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-lemmatizer") Simple lookup-based lemmatization
+aside-code("Example").
LOOKUP = {
"aba": "abar",
"ababa": "abar",
"ababais": "abar",
"ababan": "abar",
"ababanes": "ababán"
}
p
| spaCy now supports simple lookup-based lemmatization. The data is stored
| in a dictionary mapping a string to its lemma. To determine a token's
| lemma, spaCy simply looks it up in the table. The lookup lemmatizer can
| be imported from #[code spacy.lemmatizerlookup]. It's initialised with
| the lookup table, and should be returned by the #[code create_lemmatizer]
| classmethod of the language's defaults.
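p
| Wiring this up in a language's defaults could look roughly like this
| (a sketch based on the description above; the class name and lookup
| table are illustrative):
+code.
from spacy.language import Language
from spacy.lemmatizerlookup import Lemmatizer
LOOKUP = {"aba": "abar", "ababa": "abar"}
class CustomDefaults(Language.Defaults):
    @classmethod
    def create_lemmatizer(cls, nlp=None):
        # return a lookup lemmatizer initialised with the table
        return Lemmatizer(LOOKUP)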
p
| Language-specific data now lives in its own submodule, #[code spacy.lang].
| Languages are lazy-loaded, i.e. only loaded when you import a
| #[code Language] class, or load a model that initialises one. This allows
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
| complex regular expressions. The language data has also been tidied up
| and simplified. It's now also possible to overwrite the functions that
| compute lexical attributes like #[code like_num], and supply
| language-specific syntax iterators, e.g. to determine noun chunks. spaCy
| now also supports simple lookup-based lemmatization. The data is stored
| in a dictionary mapping a string to its lemma.
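p
| Overriding a lexical attribute function could look roughly like this
| (a sketch; the word list is made up for illustration):
+code.
from spacy.attrs import LIKE_NUM
_num_words = ['zero', 'one', 'two', 'three', 'ten', 'hundred']
def like_num(text):
    # treat digits and a small set of number words as numeric
    return text.isdigit() or text.lower() in _num_words
LEX_ATTRS = {LIKE_NUM: like_num}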
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-matcher") Revised matcher API
@ -95,7 +112,7 @@ p
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', None,
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
[{LOWER: 'hello'}, {LOWER: 'world'}])
assert len(matcher) == 1
@ -113,12 +130,6 @@ p
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+h(3, "features-serializer") Serialization
+infobox
| #[strong API:] #[+api("serializer") #[code Serializer]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-models") Neural network models for English, German, French and Spanish
+infobox
@ -128,6 +139,18 @@ p
+h(2, "incompat") Backwards incompatibilities
+table(["Old", "New"])
+row
+cell
| #[code spacy.en]
| #[code spacy.xx]
+cell
| #[code spacy.lang.en]
| #[code spacy.lang.xx]
+row
+cell #[code spacy.orth]
+cell #[code spacy.lang.xx.lex_attrs]
+row
+cell #[code Language.save_to_directory]
+cell #[+api("language#to_disk") #[code Language.to_disk]]
@ -214,6 +237,94 @@ p
+cell #[code Token.is_ancestor_of]
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+h(2, "migrating") Migrating from spaCy 1.x
+list
+item Saving, loading and serialization.
+item Processing pipelines and language data.
+item Adding patterns and callbacks to the matcher.
+item Models trained with spaCy 1.x.
+infobox("Some tips")
| Before migrating, we strongly recommend writing a few
| #[strong simple tests] specific to how you're using spaCy in your
| application. This makes it easier to check whether your code requires
| changes, and if so, which parts are affected.
| (By the way, feel free to contribute your tests to
| #[+src(gh("spaCy", "spacy/tests")) our test suite]. This will also ensure
| we never accidentally introduce a bug in a workflow that's
| important to you.) If you've trained your own models, keep in mind that
| your train and runtime inputs must match. This means you'll have to
| #[strong retrain your models] with spaCy v2.0 to make them compatible.
+h(3, "migrating-saving-loading") Saving, loading and serialization
p
| Double-check all calls to #[code spacy.load()] and make sure they don't
| use the #[code path] keyword argument. If you're only loading in binary
| data and not a model package that can construct its own #[code Language]
| class and pipeline, you should now use the
| #[+api("language#from_disk") #[code Language.from_disk()]] method.
+code-new.
nlp = spacy.load('/model')
nlp = English().from_disk('/model/data')
+code-old nlp = spacy.load('en', path='/model')
p
| Review all other code that writes state to disk or bytes.
| All containers now share the same, consistent API for saving and
| loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and
| loading with #[code from_disk()] and #[code from_bytes()].
+code-new.
nlp.to_disk('/model')
nlp.vocab.to_disk('/vocab')
+code-old.
nlp.save_to_directory('/model')
nlp.vocab.dump('/vocab')
+h(3, "migrating-languages") Processing pipelines and language data
p
| If you're importing language data or #[code Language] classes, make sure
| to change your import statements to import from #[code spacy.lang]. If
| you've added your own custom language, it needs to be moved to
| #[code spacy/lang/xx] and adjusted accordingly.
+code-new from spacy.lang.en import English
+code-old from spacy.en import English
p
| If you've been using custom pipeline components, check out the new
| guide on #[+a("/docs/usage/language-processing-pipelines") processing pipelines].
| Appending functions to the pipeline still works, but you might be able
| to make this more convenient by registering "component factories".
| Components of the processing pipeline can now be disabled by passing a
| list of their names to the #[code disable] keyword argument on loading
| or processing.
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)
+h(3, "migrating-matcher") Adding patterns and callbacks to the matcher
p
| If you're using the matcher, you can now add patterns in one step. This
| should be easy to update: simply merge the ID, callback and patterns
| into one call to #[+api("matcher#add") #[code matcher.add]].
+code-new.
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+code-old.
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+h(3, "migrating-models") Trained models

View File

@ -180,8 +180,8 @@ p
p
| If you don't need the web server and just want to generate the markup,
| for example to export it to a file or serve it in a custom
| way, you can use #[+api("displacy#render") #[code displacy.render]].
| It works the same way, but returns a string containing the markup.
+code("Example").
import spacy
@ -220,10 +220,32 @@ p
| a standalone graphic.) So instead of rendering all #[code Doc]s at once,
| loop over them and export them separately.
+h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses
+code("Example").
import spacy
from spacy import displacy
from pathlib import Path
nlp = spacy.load('en')
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
    doc = nlp(sent)
    svg = displacy.render(doc, style='dep')
    file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
    output_path = Path('/images/' + file_name)
    output_path.open('w', encoding='utf-8').write(svg)
p
| The above code will generate the dependency visualizations and write them to
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+h(2, "jupyter") Using displaCy in Jupyter notebooks
p
| displaCy is able to detect whether you're working in a
| #[+a("https://jupyter.org") Jupyter] notebook, and will return markup
| that can be rendered in a cell straight away. When you export your
| notebook, the visualizations will be included as HTML.
@ -257,28 +279,6 @@ p
html = displacy.render(doc, style='dep')
return display(HTML(html))
+h(2, "examples") Usage examples
+h(3, "examples-export-svg") Export SVG graphics of dependency parses
+code("Example").
import spacy
from spacy import displacy
from pathlib import Path
nlp = spacy.load('en')
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
doc = nlp(sentence)
svg = displacy.render(doc, style='dep')
file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
output_path = Path('/images/' + file_name)
output_path.open('w', encoding='utf-8').write(svg)
p
| The above code will generate the dependency visualizations and them to
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+h(2, "manual-usage") Rendering data manually
p
@ -314,3 +314,62 @@ p
'text': 'But Google is starting from behind.',
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
'title': None
}
+h(2, "webapp") Using displaCy in a web application
p
| If you want to use the visualizers as part of a web application, for
| example to create something like our
| #[+a(DEMOS_URL + "/displacy") online demo], it's not recommended to
| simply wrap and serve the displaCy renderer. Instead, you should only
| rely on the server to perform spaCy's processing capabilities, and use
| #[+a(gh("displacy")) displaCy.js] to render the JSON-formatted output.
+aside("Why not return the HTML by the server?")
| It's certainly possible to just have your server return the markup.
| But outputting raw, unsanitised HTML is risky and makes your app vulnerable to
| #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting]
| (XSS). All your user needs to do is find a way to make spaCy return one
| token #[code &lt;script src="malicious-code.js"&gt;&lt;/script&gt;].
| Instead of relying on the server to render and sanitize HTML, you
| can do this on the client in JavaScript. displaCy.js creates
| the SVG markup as DOM nodes and will never insert raw HTML.
p
| The #[code parse_deps] function takes a #[code Doc] object and returns
| a dictionary in a format that can be rendered by displaCy.
+code("Example").
import spacy
from spacy import displacy
nlp = spacy.load('en')
def displacy_service(text):
    doc = nlp(text)
    return displacy.parse_deps(doc)
p
| Using a library like #[+a("https://falconframework.org/") Falcon] or
| #[+a("http://www.hug.rest/") Hug], you can easily turn the above code
| into a simple REST API that receives a text and returns a JSON-formatted
| parse. In your front-end, include #[+a(gh("displacy")) displacy.js] and
| initialise it with the API URL and the ID or query selector of the
| container to render the visualisation in, e.g. #[code '#displacy'] for
| #[code &lt;div id="displacy"&gt;].
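p
| On the server side, a minimal endpoint could look like this (a sketch
| using Hug; adapt it to the framework of your choice):
+code("Example").
import hug
import spacy
from spacy import displacy
nlp = spacy.load('en')
@hug.post('/dep')
def dep(text):
    # return the JSON-formatted parse for the submitted text
    doc = nlp(text)
    return displacy.parse_deps(doc)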
+code("script.js", "javascript").
var displacy = new displaCy('http://localhost:8080', {
container: '#displacy'
})
function parse(text) {
displacy.parse(text);
}
p
| When you call #[code parse()], it will make a request to your API,
| receive the JSON-formatted parse and render it in your container. To
| create an interactive experience, you could trigger this function by
| a button and read the text from an #[code &lt;input&gt;] field.

View File

@ -6,46 +6,40 @@ p
| Dense, real valued vectors representing distributional similarity
| information are now a cornerstone of practical NLP. The most common way
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
| family of algorithms.
+aside("Tip")
| If you need to train a word2vec model, we recommend the implementation in
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
p
| spaCy makes using word vectors very easy. The
| #[+api("lexeme") #[code Lexeme]], #[+api("token") #[code Token]],
| #[+api("span") #[code Span]] and #[+api("doc") #[code Doc]] classes all
| have a #[code .vector] property, which is a 1-dimensional numpy array of
| 32-bit floats:
+code.
apples, and_, oranges = nlp(u'apples and oranges')
print(apples.vector.shape)
# (300,)
apples.similarity(oranges)
p
| By default, #[code Token.vector] returns the vector for its underlying
| lexeme, while #[code Doc.vector] and #[code Span.vector] return an
| average of the vectors of their tokens. You can customize these
| behaviours by modifying the #[code doc.user_hooks],
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
| dictionaries.
+aside-code("Example").
# TODO
p
| The default
| #[+a("/docs/usage/models#available") English model] installs
| 300-dimensional vectors trained on the Common Crawl
| corpus using the #[+a("http://nlp.stanford.edu/projects/glove/") GloVe]
| algorithm. The GloVe common crawl vectors have become a de facto
| standard for practical NLP.
+aside-code("Example").
+aside("Tip: Training a word2vec model")
| If you need to train a word2vec model, we recommend the implementation in
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
+h(2, "101") Similarity and word vectors 101
+tag-model("vectors")
include _spacy-101/_similarity
include _spacy-101/_word-vectors
+h(2, "custom") Customising word vectors
p
| By default, #[+api("token#vector") #[code Token.vector]] returns the
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while
| #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] return an average of the
| vectors of their tokens.
p
| You can customize these
| behaviours by modifying the #[code doc.user_hooks],
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
| dictionaries.
+code("Example").
# TODO
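p
| A user hook is simply a function that receives the object and returns
| the value you want. For example, a sketch that makes #[code Doc.vector]
| return the vector of the first token (illustrative only):
+code.
def first_token_vector(doc):
    # use the first token's vector as the Doc's vector
    return doc[0].vector
doc = nlp(u'apples and oranges')
doc.user_hooks['vector'] = first_token_vector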
p
@ -56,11 +50,14 @@ p
| can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a
| path to a binary file written by #[code vocab.dump_vectors()].
+aside-code("Example").
+code("Example").
# TODO
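p
| Following the method names mentioned above, a round trip would look
| roughly like this (a sketch; the path is illustrative):
+code.
# write the current vectors to a binary file, then load them back
nlp.vocab.dump_vectors('/tmp/vectors.bin')
nlp.vocab.vectors_from_bin_loc('/tmp/vectors.bin')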
p
| You can also load vectors from memory by writing to the
| #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors
| you are writing are of different dimensionality
| from the ones currently loaded, you should first call
| #[code vocab.resize_vectors(new_size)].
+h(2, "similarity") Similarity