mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Update symbols and document missing token attributes (see #1439)
This commit is contained in:
parent
4acab77a8a
commit
108f1f786e
|
@ -13,12 +13,12 @@ cdef enum symbol_t:
|
||||||
LIKE_EMAIL
|
LIKE_EMAIL
|
||||||
IS_STOP
|
IS_STOP
|
||||||
IS_OOV
|
IS_OOV
|
||||||
|
IS_BRACKET
|
||||||
|
IS_QUOTE
|
||||||
|
IS_LEFT_PUNCT
|
||||||
|
IS_RIGHT_PUNCT
|
||||||
|
|
||||||
FLAG14 = 14
|
FLAG18 = 18
|
||||||
FLAG15
|
|
||||||
FLAG16
|
|
||||||
FLAG17
|
|
||||||
FLAG18
|
|
||||||
FLAG19
|
FLAG19
|
||||||
FLAG20
|
FLAG20
|
||||||
FLAG21
|
FLAG21
|
||||||
|
@ -455,16 +455,5 @@ cdef enum symbol_t:
|
||||||
root
|
root
|
||||||
xcomp
|
xcomp
|
||||||
|
|
||||||
# Move these up to FLAG14--FLAG18 once we finish the functionality
|
acl
|
||||||
# and are ready to regenerate the model.
|
|
||||||
#IS_BRACKET
|
|
||||||
#IS_QUOTE
|
|
||||||
#IS_LEFT_PUNCT
|
|
||||||
#IS_RIGHT_PUNCT
|
|
||||||
|
|
||||||
# These symbols are currently missing. However, if we add them currently,
|
|
||||||
# we'll throw off the integer index and the model will have to be retrained.
|
|
||||||
# We therefore wait until the next data version to add them.
|
|
||||||
# acl
|
|
||||||
|
|
||||||
LAW
|
LAW
|
||||||
|
|
|
@ -18,10 +18,11 @@ IDS = {
|
||||||
"LIKE_EMAIL": LIKE_EMAIL,
|
"LIKE_EMAIL": LIKE_EMAIL,
|
||||||
"IS_STOP": IS_STOP,
|
"IS_STOP": IS_STOP,
|
||||||
"IS_OOV": IS_OOV,
|
"IS_OOV": IS_OOV,
|
||||||
"FLAG14": FLAG14,
|
"IS_BRACKET": IS_BRACKET,
|
||||||
"FLAG15": FLAG15,
|
"IS_QUOTE": IS_QUOTE,
|
||||||
"FLAG16": FLAG16,
|
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
|
||||||
"FLAG17": FLAG17,
|
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
|
||||||
|
|
||||||
"FLAG18": FLAG18,
|
"FLAG18": FLAG18,
|
||||||
"FLAG19": FLAG19,
|
"FLAG19": FLAG19,
|
||||||
"FLAG20": FLAG20,
|
"FLAG20": FLAG20,
|
||||||
|
@ -457,7 +458,9 @@ IDS = {
|
||||||
"quantmod": quantmod,
|
"quantmod": quantmod,
|
||||||
"rcmod": rcmod,
|
"rcmod": rcmod,
|
||||||
"root": root,
|
"root": root,
|
||||||
"xcomp": xcomp
|
"xcomp": xcomp,
|
||||||
|
|
||||||
|
"acl": acl,
|
||||||
"LAW": LAW
|
"LAW": LAW
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -586,6 +586,16 @@ p The L2 norm of the token's vector representation.
|
||||||
+cell bool
|
+cell bool
|
||||||
+cell Is the token punctuation?
|
+cell Is the token punctuation?
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_left_punct]
|
||||||
|
+cell bool
|
||||||
|
+cell Is the token a left punctuation mark, e.g. #[code (]?
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_right_punct]
|
||||||
|
+cell bool
|
||||||
|
+cell Is the token a right punctuation mark, e.g. #[code )]?
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code is_space]
|
+cell #[code is_space]
|
||||||
+cell bool
|
+cell bool
|
||||||
|
@ -593,6 +603,16 @@ p The L2 norm of the token's vector representation.
|
||||||
| Does the token consist of whitespace characters? Equivalent to
|
| Does the token consist of whitespace characters? Equivalent to
|
||||||
| #[code token.text.isspace()].
|
| #[code token.text.isspace()].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_bracket]
|
||||||
|
+cell bool
|
||||||
|
+cell Is the token a bracket?
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_quote]
|
||||||
|
+cell bool
|
||||||
|
+cell Is the token a quotation mark?
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code like_url]
|
+cell #[code like_url]
|
||||||
+cell bool
|
+cell bool
|
||||||
|
|
Loading…
Reference in New Issue
Block a user