# Provenance: patch 2fee67cfa3938576387fe10bd168cc0820a6e6c5
# From: Matthew Honnibal, Tue, 7 Apr 2015 03:45:18 +0200
# Subject: Add regular expressions for English multi-word expressions
# Target file: spacy/en/regexes.py (new file)
"""Regular expressions for English multi-word expressions.

Defines compiled patterns for multi-word prepositions, clock times, money
amounts and weekday names, and collects them in ``REGEXES``.
"""
import re


# Literal multi-word prepositions. They contain no regex metacharacters, so
# they are joined into an alternation without escaping.
_mw_prepositions = [
    'close to',
    'down by',
    'on the way to',
    'on my way',
    'on his way',
    'on her way',
    'on your way',
    'on our way',
    'on their way',
]


# Case-insensitive alternation over every phrase in ``_mw_prepositions``.
MW_PREPOSITIONS_RE = re.compile('|'.join(_mw_prepositions), flags=re.IGNORECASE)


# Clock times, in two shapes:
#   * H:MM or H:MM:SS, optionally followed by an am/pm marker
#     ("10:30", "10:30pm", "10:30:15 a.m.")
#   * one or two digits that MUST be followed by an am/pm marker ("5 pm");
#     here the marker is captured as group 1.
# The am/pm suffix is attached to the colon form as an optional group instead
# of being a separate alternative: alternation takes the first branch that
# succeeds, so a bare ``colon_digits`` branch listed first would always win
# and the suffix would never be included in the match.
# NOTE: no IGNORECASE flag is set, so only lowercase markers are recognised.
TIME_RE = re.compile(
    r'{colon_digits}(?: ?{am_pm})?|{one_two_digits} ?({am_pm})'.format(
        colon_digits=r'[0-2]?[0-9]:[0-5][0-9](?::[0-5][0-9])?',
        one_two_digits=r'[0-2]?[0-9]',
        am_pm=r'[ap]\.?m\.?'))


# Money amounts: "$12", "$12.50", "12 dollars", "12 dollars 50 cents".
# Raw string so that \$, \d and \. reach the regex engine untouched.
MONEY_RE = re.compile(r'\$\d+(?:\.\d+)?|\d+ dollars(?: \d+ cents)?')


# Capitalised English weekday names (case-sensitive).
DAYS_RE = re.compile('Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday')


# (label, label, compiled pattern) triples.
# NOTE(review): the two strings look like a part-of-speech tag and an entity
# type respectively -- confirm against the code that consumes REGEXES.
REGEXES = [
    ('IN', 'O', MW_PREPOSITIONS_RE),
    ('CD', 'TIME', TIME_RE),
    ('NNP', 'DATE', DAYS_RE),
    ('CD', 'MONEY', MONEY_RE),
]