Update symbols and document missing token attributes (see #1439)

This commit is contained in:
ines 2017-10-20 13:08:44 +02:00
parent 4acab77a8a
commit 108f1f786e
3 changed files with 34 additions and 22 deletions

View File

@ -13,12 +13,12 @@ cdef enum symbol_t:
LIKE_EMAIL LIKE_EMAIL
IS_STOP IS_STOP
IS_OOV IS_OOV
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
FLAG14 = 14 FLAG18 = 18
FLAG15
FLAG16
FLAG17
FLAG18
FLAG19 FLAG19
FLAG20 FLAG20
FLAG21 FLAG21
@ -455,16 +455,5 @@ cdef enum symbol_t:
root root
xcomp xcomp
# Move these up to FLAG14--FLAG18 once we finish the functionality acl
# and are ready to regenerate the model.
#IS_BRACKET
#IS_QUOTE
#IS_LEFT_PUNCT
#IS_RIGHT_PUNCT
# These symbols are currently missing. However, if we add them currently,
# we'll throw off the integer index and the model will have to be retrained.
# We therefore wait until the next data version to add them.
# acl
LAW LAW

View File

@ -18,10 +18,11 @@ IDS = {
"LIKE_EMAIL": LIKE_EMAIL, "LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP, "IS_STOP": IS_STOP,
"IS_OOV": IS_OOV, "IS_OOV": IS_OOV,
"FLAG14": FLAG14, "IS_BRACKET": IS_BRACKET,
"FLAG15": FLAG15, "IS_QUOTE": IS_QUOTE,
"FLAG16": FLAG16, "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"FLAG17": FLAG17, "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"FLAG18": FLAG18, "FLAG18": FLAG18,
"FLAG19": FLAG19, "FLAG19": FLAG19,
"FLAG20": FLAG20, "FLAG20": FLAG20,
@ -457,7 +458,9 @@ IDS = {
"quantmod": quantmod, "quantmod": quantmod,
"rcmod": rcmod, "rcmod": rcmod,
"root": root, "root": root,
"xcomp": xcomp "xcomp": xcomp,
"acl": acl,
"LAW": LAW "LAW": LAW
} }

View File

@ -586,6 +586,16 @@ p The L2 norm of the token's vector representation.
+cell bool +cell bool
+cell Is the token punctuation? +cell Is the token punctuation?
+row
+cell #[code is_left_punct]
+cell bool
+cell Is the token a left punctuation mark, e.g. #[code (]?
+row
+cell #[code is_right_punct]
+cell bool
+cell Is the token a right punctuation mark, e.g. #[code )]?
+row +row
+cell #[code is_space] +cell #[code is_space]
+cell bool +cell bool
@ -593,6 +603,16 @@ p The L2 norm of the token's vector representation.
| Does the token consist of whitespace characters? Equivalent to | Does the token consist of whitespace characters? Equivalent to
| #[code token.text.isspace()]. | #[code token.text.isspace()].
+row
+cell #[code is_bracket]
+cell bool
+cell Is the token a bracket?
+row
+cell #[code is_quote]
+cell bool
+cell Is the token a quotation mark?
+row +row
+cell #[code like_url] +cell #[code like_url]
+cell bool +cell bool