From e7f95c37eeced064d6e239fa6a699a0bfd256501 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 8 May 2017 15:55:52 +0200 Subject: [PATCH] Merge base tokenizer exceptions --- spacy/language_data/abbreviations.py | 43 ------ spacy/language_data/emoticons.py | 148 ------------------ spacy/language_data/tokenizer_exceptions.py | 161 +++++++++++++++++++- 3 files changed, 160 insertions(+), 192 deletions(-) delete mode 100644 spacy/language_data/abbreviations.py delete mode 100644 spacy/language_data/emoticons.py diff --git a/spacy/language_data/abbreviations.py b/spacy/language_data/abbreviations.py deleted file mode 100644 index a3e95ce1a..000000000 --- a/spacy/language_data/abbreviations.py +++ /dev/null @@ -1,43 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -ABBREVIATIONS = [ - "'", - "\\\")", - "", - "''", - "C++", - "a.", - "b.", - "c.", - "d.", - "e.", - "f.", - "g.", - "h.", - "i.", - "j.", - "k.", - "l.", - "m.", - "n.", - "o.", - "p.", - "q.", - "r.", - "s.", - "t.", - "u.", - "v.", - "w.", - "x.", - "y.", - "z.", - "ä.", - "ö.", - "ü." -] - - -__all__ = [ "ABBREVIATIONS" ] diff --git a/spacy/language_data/emoticons.py b/spacy/language_data/emoticons.py deleted file mode 100644 index 223176c9c..000000000 --- a/spacy/language_data/emoticons.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -EMOTICONS = set(""" -:) -:-) -:)) -:-)) -:))) -:-))) -(: -(-: -=) -(= -") -:] -:-] -[: -[-: -:o) -(o: -:} -:-} -8) -8-) -(-8 - -;) -;-) -(; -(-; - -:( -:-( -:(( -:-(( -:((( -:-((( -): -)-: -=( ->:( - -:') -:'-) -:'( -:'-( - -:/ -:-/ -=/ -=| -:| -:-| -:1 - -:P -:-P -:p -:-p - -:O -:-O -:o -:-o -:0 -:-0 -:() ->:o - -:* -:-* -:3 -:-3 -=3 -:> -:-> - -:X -:-X -:x -:-x - -:D -:-D -;D -;-D -=D -xD -XD -xDD -XDD -8D -8-D - -^_^ -^__^ -^___^ ->.< ->.> -<.< -._. -;_; --_- --__- -v.v -V.V -v_v -V_V -o_o -o_O -O_o -O_O -0_o -o_0 -0_0 -o.O -O.o -O.O -o.o -0.0 -o.0 -0.o -@_@ - -<3 -<33 -<333 -_<) -(*_*) -(¬_¬) - -ಠ_ಠ -ಠ︵ಠ -(ಠ_ಠ) -¯\(ツ)/¯ -(╯°□°)╯︵┻━┻ -><(((*> -""".split()) - - -__all__ = [ "EMOTICONS" ] diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py index b84adb2c4..4c6d0fad2 100644 --- a/spacy/language_data/tokenizer_exceptions.py +++ b/spacy/language_data/tokenizer_exceptions.py @@ -1,9 +1,13 @@ +# coding: utf8 from __future__ import unicode_literals # The use of this module turns out to be important, to avoid pathological # back-tracking. See Issue #957 import regex +from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT + + # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex # A few minor mods to this regex to account for use cases represented in test_urls _URL_PATTERN = ( @@ -51,4 +55,159 @@ _URL_PATTERN = ( TOKEN_MATCH = regex.compile(_URL_PATTERN, regex.UNICODE).match -__all__ = ['TOKEN_MATCH'] + + +BASE_EXCEPTIONS = {} + + +for exc_data in [ + {ORTH: " ", POS: SPACE}, + {ORTH: "\t", POS: SPACE}, + {ORTH: "\\t", POS: SPACE}, + {ORTH: "\n", POS: SPACE}, + {ORTH: "\\n", POS: SPACE}, + {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"}, + {ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]: + BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)] + + +for orth in [ + "'", "\\\")", "", "''", "C++", "a.", "b.", "c.", "d.", "e.", "f.", + "g.", "h.", "i.", "j.", "k.", "l.", "m.", "n.", "o.", "p.", "q.", "r.", + "s.", "t.", "u.", "v.", "w.", "x.", "y.", "z.", "ä.", "ö.", "ü."]: + BASE_EXCEPTIONS[orth] = [{ORTH: orth}] + + +emoticons = set(""" +:) +:-) +:)) +:-)) +:))) +:-))) +(: +(-: +=) +(= +") +:] +:-] +[: +[-: +:o) +(o: +:} +:-} +8) +8-) +(-8 +;) +;-) +(; +(-; +:( +:-( +:(( +:-(( +:((( +:-((( +): +)-: +=( +>:( +:') +:'-) +:'( +:'-( +:/ +:-/ +=/ +=| +:| +:-| +:1 +:P +:-P +:p +:-p +:O +:-O +:o +:-o +:0 +:-0 +:() +>:o +:* +:-* +:3 +:-3 +=3 +:> +:-> +:X +:-X +:x +:-x +:D +:-D +;D +;-D +=D +xD +XD +xDD +XDD +8D +8-D + +^_^ +^__^ +^___^ +>.< +>.> +<.< +._. +;_; +-_- +-__- +v.v +V.V +v_v +V_V +o_o +o_O +O_o +O_O +0_o +o_0 +0_0 +o.O +O.o +O.O +o.o +0.0 +o.0 +0.o +@_@ +<3 +<33 +<333 +_<) +(*_*) +(¬_¬) +ಠ_ಠ +ಠ︵ಠ +(ಠ_ಠ) +¯\(ツ)/¯ +(╯°□°)╯︵┻━┻ +><(((*> +""".split()) + + +for orth in emoticons: + BASE_EXCEPTIONS[orth] = [{ORTH: orth}]