spaCy/spacy/lang/pt/punctuation.py
Filipe Caixeta 6c498f9ff4 Update Portuguese Language (#2790)
* Add words to Portuguese language _num_words

* Add words to Portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language
2018-09-29 09:51:45 +02:00

19 lines
545 B
Python

# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
# Portuguese-specific tokenizer punctuation rules, built on top of the shared
# base rule lists imported above.

# Prefixes: one to three word characters followed by a literal "$"
# (matches e.g. "R$" or "US$"), placed ahead of the shared prefix rules.
_prefixes = [r'\w{1,3}\$'] + BASE_TOKENIZER_PREFIXES

# Suffixes: no Portuguese-specific additions; this is the shared object
# itself, not a copy.
_suffixes = BASE_TOKENIZER_SUFFIXES

# Infixes: word characters joined by one or more hyphens (matches e.g.
# "guarda-chuva"), placed ahead of the shared infix rules.
_infixes = [r'(\w+-\w+(-\w+)*)'] + BASE_TOKENIZER_INFIXES

# Public names exported by this module.
TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES = (
    _prefixes,
    _suffixes,
    _infixes,
)