From e47ee947613f697d68fa70fc6ffe4eef2b546487 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 8 Dec 2016 19:46:43 +0100 Subject: [PATCH] Split punctuation into its own file --- spacy/de/language_data.py | 123 +------------------------- spacy/en/language_data.py | 72 +--------------- spacy/language_data/__init__.py | 1 + spacy/language_data/punctuation.py | 133 +++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 189 deletions(-) create mode 100644 spacy/language_data/punctuation.py diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py index c979815b5..5c2b48a76 100644 --- a/spacy/de/language_data.py +++ b/spacy/de/language_data.py @@ -3,6 +3,9 @@ from __future__ import unicode_literals import re from ..symbols import * +from ..language_data import TOKENIZER_PREFIXES +from ..language_data import TOKENIZER_SUFFIXES +from ..language_data import TOKENIZER_INFIXES def strings_to_exc(orths): @@ -774,123 +777,3 @@ ORTH_ONLY = [ "y.", "z.", ] - - -TOKENIZER_PREFIXES = r''' -, -" -( -[ -{ -* -< -> -$ -£ -„ -“ -' -`` -` -# -US$ -C$ -A$ -a- -‘ -.... -... -… -‚ -» -§ -'''.strip().split('\n') - - -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -% -\$ -> -: -; -' -” -“ -« -_ -'' -'s -'S -’s -’S -’ -‘ -° -€ -… -\.\. -\.\.\. -\.\.\.\. -(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. -\-\- -´ -(?<=[0-9])km² -(?<=[0-9])m² -(?<=[0-9])cm² -(?<=[0-9])mm² -(?<=[0-9])km³ -(?<=[0-9])m³ -(?<=[0-9])cm³ -(?<=[0-9])mm³ -(?<=[0-9])ha -(?<=[0-9])km -(?<=[0-9])m -(?<=[0-9])cm -(?<=[0-9])mm -(?<=[0-9])µm -(?<=[0-9])nm -(?<=[0-9])yd -(?<=[0-9])in -(?<=[0-9])ft -(?<=[0-9])kg -(?<=[0-9])g -(?<=[0-9])mg -(?<=[0-9])µg -(?<=[0-9])t -(?<=[0-9])lb -(?<=[0-9])oz -(?<=[0-9])m/s -(?<=[0-9])km/h -(?<=[0-9])mph -(?<=[0-9])°C -(?<=[0-9])°K -(?<=[0-9])°F -(?<=[0-9])hPa -(?<=[0-9])Pa -(?<=[0-9])mbar -(?<=[0-9])mb -(?<=[0-9])T -(?<=[0-9])G -(?<=[0-9])M -(?<=[0-9])K -(?<=[0-9])kb -'''.strip().split('\n') - - -TOKENIZER_INFIXES = r''' -… -\.\.\.+ -(?<=[a-z])\.(?=[A-Z]) -(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ]) -'''.strip().split('\n') diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index 72b00e5cd..5ed214159 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -3,6 +3,9 @@ from __future__ import unicode_literals import re from ..symbols import * +from ..language_data import TOKENIZER_PREFIXES +from ..language_data import TOKENIZER_SUFFIXES +from ..language_data import TOKENIZER_INFIXES def strings_to_exc(orths): @@ -2212,72 +2215,3 @@ ORTH_ONLY = [ "y.", "z." ] - - -TOKENIZER_PREFIXES = r''' -, -" -( -[ -{ -* -< -$ -£ -“ -' -`` -` -# -US$ -C$ -A$ -€ -a- -‘ -.... -... -… -'''.strip().split('\n') - - -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -% -\$ -> -: -; -' -” -'' -'s -'S -’s -’S -’ -… -\.\. -\.\.\. -\.\.\.\. -(?<=[a-z0-9)\]”"'%\)])\. -(?<=[0-9])km -'''.strip().split('\n') - - -TOKENIZER_INFIXES = r''' -… -\.\.\.+ -(?<=[a-z])\.(?=[A-Z]) -(?<=[a-zA-Z])-(?=[a-zA-z]) -(?<=[a-zA-Z])--(?=[a-zA-z]) -(?<=[0-9])-(?=[0-9]) -(?<=[A-Za-z]),(?=[A-Za-z]) -'''.strip().split('\n') diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index 42bdf1a9a..5e56a9937 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -1 +1,2 @@ from .emoticons import * +from .punctuation import * diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py new file mode 100644 index 000000000..65d7f95e8 --- /dev/null +++ b/spacy/language_data/punctuation.py @@ -0,0 +1,133 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +TOKENIZER_PREFIXES = r''' +, +" +( +[ +{ +* +< +> +$ +£ +¡ +¿ +„ +“ +' +`` +` +# +‘ +.... +... +… +‚ +» +§ +US$ +C$ +A$ +a- +'''.strip().split('\n') + + +TOKENIZER_SUFFIXES = r''' +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +“ +« +_ +'' +'s +'S +’s +’S +’ +‘ +° +€ +… +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9)\]”"'%\)])\. +(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. +\-\- +´ +(?<=[0-9])km² +(?<=[0-9])m² +(?<=[0-9])cm² +(?<=[0-9])mm² +(?<=[0-9])km³ +(?<=[0-9])m³ +(?<=[0-9])cm³ +(?<=[0-9])mm³ +(?<=[0-9])ha +(?<=[0-9])km +(?<=[0-9])m +(?<=[0-9])cm +(?<=[0-9])mm +(?<=[0-9])µm +(?<=[0-9])nm +(?<=[0-9])yd +(?<=[0-9])in +(?<=[0-9])ft +(?<=[0-9])kg +(?<=[0-9])g +(?<=[0-9])mg +(?<=[0-9])µg +(?<=[0-9])t +(?<=[0-9])lb +(?<=[0-9])oz +(?<=[0-9])m/s +(?<=[0-9])km/h +(?<=[0-9])mph +(?<=[0-9])°C +(?<=[0-9])°K +(?<=[0-9])°F +(?<=[0-9])hPa +(?<=[0-9])Pa +(?<=[0-9])mbar +(?<=[0-9])mb +(?<=[0-9])T +(?<=[0-9])G +(?<=[0-9])M +(?<=[0-9])K +(?<=[0-9])kb +'''.strip().split('\n') + + +TOKENIZER_INFIXES = r''' +… +\.\.\.+ +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-z]) +(?<=[a-zA-Z])--(?=[a-zA-z]) +(?<=[0-9])-(?=[0-9]) +(?<=[A-Za-z]),(?=[A-Za-z]) +(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ]) +'''.strip().split('\n') + + +__all__ = [ "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES" ]