diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 4f1d35cf8..6960681a3 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -13,12 +13,12 @@ cdef enum symbol_t: LIKE_EMAIL IS_STOP IS_OOV + IS_BRACKET + IS_QUOTE + IS_LEFT_PUNCT + IS_RIGHT_PUNCT - FLAG14 = 14 - FLAG15 - FLAG16 - FLAG17 - FLAG18 + FLAG18 = 18 FLAG19 FLAG20 FLAG21 @@ -455,16 +455,5 @@ cdef enum symbol_t: root xcomp -# Move these up to FLAG14--FLAG18 once we finish the functionality -# and are ready to regenerate the model. -#IS_BRACKET -#IS_QUOTE -#IS_LEFT_PUNCT -#IS_RIGHT_PUNCT - -# These symbols are currently missing. However, if we add them currently, -# we'll throw off the integer index and the model will have to be retrained. -# We therefore wait until the next data version to add them. -# acl - + acl LAW diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index f64577309..0e0337b6e 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -18,10 +18,11 @@ IDS = { "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, "IS_OOV": IS_OOV, - "FLAG14": FLAG14, - "FLAG15": FLAG15, - "FLAG16": FLAG16, - "FLAG17": FLAG17, + "IS_BRACKET": IS_BRACKET, + "IS_QUOTE": IS_QUOTE, + "IS_LEFT_PUNCT": IS_LEFT_PUNCT, + "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT, + "FLAG18": FLAG18, "FLAG19": FLAG19, "FLAG20": FLAG20, @@ -457,7 +458,9 @@ IDS = { "quantmod": quantmod, "rcmod": rcmod, "root": root, - "xcomp": xcomp + "xcomp": xcomp, + + "acl": acl, "LAW": LAW } diff --git a/website/api/token.jade b/website/api/token.jade index 465d44c66..4062594b4 100644 --- a/website/api/token.jade +++ b/website/api/token.jade @@ -586,6 +586,16 @@ p The L2 norm of the token's vector representation. +cell bool +cell Is the token punctuation? + +row + +cell #[code is_left_punct] + +cell bool + +cell Is the token a left punctuation mark, e.g. #[code (]? + + +row + +cell #[code is_right_punct] + +cell bool + +cell Is the token a right punctuation mark, e.g. #[code )]? + +row +cell #[code is_space] +cell bool @@ -593,6 +603,16 @@ p The L2 norm of the token's vector representation. | Does the token consist of whitespace characters? Equivalent to | #[code token.text.isspace()]. + +row + +cell #[code is_bracket] + +cell bool + +cell Is the token a bracket? + + +row + +cell #[code is_quote] + +cell bool + +cell Is the token a quotation mark? + +row +cell #[code like_url] +cell bool