mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Update symbols and document missing token attributes (see #1439)
This commit is contained in:
parent
4acab77a8a
commit
108f1f786e
|
@ -13,12 +13,12 @@ cdef enum symbol_t:
|
|||
LIKE_EMAIL
|
||||
IS_STOP
|
||||
IS_OOV
|
||||
IS_BRACKET
|
||||
IS_QUOTE
|
||||
IS_LEFT_PUNCT
|
||||
IS_RIGHT_PUNCT
|
||||
|
||||
FLAG14 = 14
|
||||
FLAG15
|
||||
FLAG16
|
||||
FLAG17
|
||||
FLAG18
|
||||
FLAG18 = 18
|
||||
FLAG19
|
||||
FLAG20
|
||||
FLAG21
|
||||
|
@ -455,16 +455,5 @@ cdef enum symbol_t:
|
|||
root
|
||||
xcomp
|
||||
|
||||
# Move these up to FLAG14--FLAG18 once we finish the functionality
|
||||
# and are ready to regenerate the model.
|
||||
#IS_BRACKET
|
||||
#IS_QUOTE
|
||||
#IS_LEFT_PUNCT
|
||||
#IS_RIGHT_PUNCT
|
||||
|
||||
# These symbols are currently missing. However, if we add them currently,
|
||||
# we'll throw off the integer index and the model will have to be retrained.
|
||||
# We therefore wait until the next data version to add them.
|
||||
# acl
|
||||
|
||||
acl
|
||||
LAW
|
||||
|
|
|
@ -18,10 +18,11 @@ IDS = {
|
|||
"LIKE_EMAIL": LIKE_EMAIL,
|
||||
"IS_STOP": IS_STOP,
|
||||
"IS_OOV": IS_OOV,
|
||||
"FLAG14": FLAG14,
|
||||
"FLAG15": FLAG15,
|
||||
"FLAG16": FLAG16,
|
||||
"FLAG17": FLAG17,
|
||||
"IS_BRACKET": IS_BRACKET,
|
||||
"IS_QUOTE": IS_QUOTE,
|
||||
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
|
||||
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
|
||||
|
||||
"FLAG18": FLAG18,
|
||||
"FLAG19": FLAG19,
|
||||
"FLAG20": FLAG20,
|
||||
|
@ -457,7 +458,9 @@ IDS = {
|
|||
"quantmod": quantmod,
|
||||
"rcmod": rcmod,
|
||||
"root": root,
|
||||
"xcomp": xcomp
|
||||
"xcomp": xcomp,
|
||||
|
||||
"acl": acl,
|
||||
"LAW": LAW
|
||||
}
|
||||
|
||||
|
|
|
@ -586,6 +586,16 @@ p The L2 norm of the token's vector representation.
|
|||
+cell bool
|
||||
+cell Is the token punctuation?
|
||||
|
||||
+row
|
||||
+cell #[code is_left_punct]
|
||||
+cell bool
|
||||
+cell Is the token a left punctuation mark, e.g. #[code (]?
|
||||
|
||||
+row
|
||||
+cell #[code is_right_punct]
|
||||
+cell bool
|
||||
+cell Is the token a right punctuation mark, e.g. #[code )]?
|
||||
|
||||
+row
|
||||
+cell #[code is_space]
|
||||
+cell bool
|
||||
|
@ -593,6 +603,16 @@ p The L2 norm of the token's vector representation.
|
|||
| Does the token consist of whitespace characters? Equivalent to
|
||||
| #[code token.text.isspace()].
|
||||
|
||||
+row
|
||||
+cell #[code is_bracket]
|
||||
+cell bool
|
||||
+cell Is the token a bracket?
|
||||
|
||||
+row
|
||||
+cell #[code is_quote]
|
||||
+cell bool
|
||||
+cell Is the token a quotation mark?
|
||||
|
||||
+row
|
||||
+cell #[code like_url]
|
||||
+cell bool
|
||||
|
|
Loading…
Reference in New Issue
Block a user