# coding: utf8
from __future__ import unicode_literals

# The third-party `regex` module is used in place of the stdlib `re` on
# purpose: it avoids pathological back-tracking on the URL pattern below.
# See Issue #957.
import regex as re

from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT


# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# A few minor mods to this regex to account for use cases represented in
# test_urls.
URL_PATTERN = (
    r"^"
    # Lookahead: the match must start with a word character, in order to
    # support prefix tokenization (see prefix test cases in test_urls).
    r"(?=[\w])"
    # protocol identifier
    r"(?:(?:https?|ftp|mailto)://)?"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    # MH: Do we really need this? Seems excessive, and seems to have caused
    # Issue #957
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"
    # domain name
    r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    # query parameters
    # (Previously written as "(:?\S*)?", a typo for a non-capturing group.
    # ":" is itself matched by \S, so the set of accepted strings is the
    # same; only the accidental capture group is gone.)
    r"\??(?:\S*)?"
    # Lookbehind: the match must end with a word character or "/", in order
    # to support suffix tokenization (see suffix test cases in test_urls).
    r"(?<=[\w/])"
    r"$"
)

# Bound `match` method of the compiled URL pattern: returns a match object
# when the given string is a URL according to URL_PATTERN, otherwise None.
TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match


# Maps an exact surface string to a one-element list holding the attribute
# dict for the single token that string should become.
BASE_EXCEPTIONS = {}

# Whitespace-like and punctuation strings that carry an explicit POS (and,
# for some, a LEMMA).
for exc_data in [
    {ORTH: " ", POS: SPACE},
    {ORTH: "\t", POS: SPACE},
    {ORTH: "\\t", POS: SPACE},  # literal backslash + "t", not a tab
    {ORTH: "\n", POS: SPACE},
    {ORTH: "\\n", POS: SPACE},  # literal backslash + "n", not a newline
    {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},  # em dash
    {ORTH: "\u00a0", POS: SPACE, LEMMA: " "},  # no-break space
]:
    BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]


# Strings registered as exceptions with only their ORTH set.
# NOTE(review): the empty-string entry looks like extraction damage (an
# angle-bracketed literal may have been stripped from the file) -- confirm
# against the upstream source before relying on it.
for orth in [
    "'",
    '\\")',
    "",
    "''",
    "C++",
    "a.",
    "b.",
    "c.",
    "d.",
    "e.",
    "f.",
    "g.",
    "h.",
    "i.",
    "j.",
    "k.",
    "l.",
    "m.",
    "n.",
    "o.",
    "p.",
    "q.",
    "r.",
    "s.",
    "t.",
    "u.",
    "v.",
    "w.",
    "x.",
    "y.",
    "z.",
    "ä.",
    "ö.",
    "ü.",
]:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]


# Emoticons, whitespace-separated. The literal is a raw string because one
# entry contains a backslash followed by "(", which is an invalid escape
# sequence in a normal string literal.
# NOTE(review): the entry "_<)" looks like a truncated emoticon (text between
# angle brackets may have been stripped, taking neighbouring entries with
# it) -- confirm against the upstream source.
emoticons = set(
    r"""
:) :-) :)) :-)) :))) :-)))
(: (-: =) (= ") :] :-] [: [-: :o) (o: :} :-}
8) 8-) (-8
;) ;-) (; (-;
:( :-( :(( :-(( :((( :-((( ): )-: =( >:(
:') :'-) :'( :'-(
:/ :-/ =/ =| :| :-|
:1
:P :-P :p :-p
:O :-O :o :-o :0 :-0 :() >:o
:* :-*
:3 :-3 =3
:> :->
:X :-X :x :-x
:D :-D ;D ;-D =D xD XD xDD XDD 8D 8-D
^_^ ^__^ ^___^
>.< >.> <.< ._. ;_; -_- -__-
v.v V.V v_v V_V
o_o o_O O_o O_O 0_o o_0 0_0
o.O O.o O.O o.o 0.0 o.0 0.o
@_@
<3 <33 <333
_<)
(*_*)
(¬_¬)
ಠ_ಠ
ಠ︵ಠ
(ಠ_ಠ)
¯\(ツ)/¯
(╯°□°)╯︵┻━┻
><(((*>
""".split()
)

for orth in emoticons:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]