Remove unicode declarations and update language data

This commit is contained in:
Ines Montani 2020-09-04 13:19:16 +02:00
parent ba600f91c5
commit df0b68f60e
13 changed files with 27 additions and 73 deletions

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.
>>> from spacy.lang.cs.examples import sentences >>> from spacy.lang.cs.examples import sentences
@ -10,9 +6,9 @@ Example sentences to test spaCy and its language models.
sentences = [ sentences = [
"Máma mele maso.", "Máma mele maso.",
"Příliš žluťoučký kůň úpěl ďábelské ódy.", "Příliš žluťoučký kůň úpěl ďábelské ódy.",
"ArcGIS je geografický informační systém určený pro práci s prostorovými daty." , "ArcGIS je geografický informační systém určený pro práci s prostorovými daty.",
"Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat.", "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat.",
"Dnes je krásné počasí.", "Dnes je krásné počasí.",
"Nestihl autobus, protože pozdě vstal z postele.", "Nestihl autobus, protože pozdě vstal z postele.",
@ -39,4 +35,4 @@ sentences = [
"Jaké PSČ má Praha 1?", "Jaké PSČ má Praha 1?",
"PSČ Prahy 1 je 110 00.", "PSČ Prahy 1 je 110 00.",
"Za 20 minut jede vlak.", "Za 20 minut jede vlak.",
] ]

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
@ -43,7 +40,7 @@ _num_words = [
"kvadrilion", "kvadrilion",
"kvadriliarda", "kvadriliarda",
"kvintilion", "kvintilion",
] ]
def like_num(text): def like_num(text):

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
@ -73,6 +70,7 @@ _ordinal_words = [
"עשירי", "עשירי",
] ]
def like_num(text): def like_num(text):
if text.startswith(("+", "-", "±", "~")): if text.startswith(("+", "-", "±", "~")):
text = text[1:] text = text[1:]
@ -84,7 +82,7 @@ def like_num(text):
num, denom = text.split("/") num, denom = text.split("/")
if num.isdigit() and denom.isdigit(): if num.isdigit() and denom.isdigit():
return True return True
if text in _num_words: if text in _num_words:
return True return True

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt # Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,18 +1,10 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...attrs import LANG
class SanskritDefaults(Language.Defaults): class SanskritDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = LEX_ATTRS
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "sa"
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,9 +1,5 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
# reference 1: https://en.wikibooks.org/wiki/Sanskrit/Numbers # reference 1: https://en.wikibooks.org/wiki/Sanskrit/Numbers
_num_words = [ _num_words = [
@ -106,26 +102,26 @@ _num_words = [
"सप्तनवतिः", "सप्तनवतिः",
"अष्टनवतिः", "अष्टनवतिः",
"एकोनशतम्", "एकोनशतम्",
"शतम्" "शतम्",
] ]
def like_num(text): def like_num(text):
""" """
Check if text resembles a number Check if text resembles a number
""" """
if text.startswith(("+", "-", "±", "~")): if text.startswith(("+", "-", "±", "~")):
text = text[1:] text = text[1:]
text = text.replace(",", "").replace(".", "") text = text.replace(",", "").replace(".", "")
if text.isdigit(): if text.isdigit():
return True return True
if text.count("/") == 1: if text.count("/") == 1:
num, denom = text.split("/") num, denom = text.split("/")
if num.isdigit() and denom.isdigit(): if num.isdigit() and denom.isdigit():
return True return True
if text in _num_words: if text in _num_words:
return True return True
return False return False
LEX_ATTRS = {LIKE_NUM: like_num} LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323 # Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest import pytest

View File

@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest import pytest

View File

@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest import pytest

View File

@ -1,15 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tokens import Span from spacy.tokens import Span
from spacy import displacy from spacy import displacy
SAMPLE_TEXT = '''First line
SAMPLE_TEXT = """First line
Second line, with ent Second line, with ent
Third line Third line
Fourth line Fourth line
''' """
def test_issue5838(): def test_issue5838():
@ -18,8 +16,8 @@ def test_issue5838():
nlp = English() nlp = English()
doc = nlp(SAMPLE_TEXT) doc = nlp(SAMPLE_TEXT)
doc.ents = [Span(doc, 7, 8, label='test')] doc.ents = [Span(doc, 7, 8, label="test")]
html = displacy.render(doc, style='ent') html = displacy.render(doc, style="ent")
found = html.count('</br>') found = html.count("</br>")
assert found == 4 assert found == 4

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English from spacy.lang.en import English
from spacy.pipeline import merge_entities, EntityRuler from spacy.pipeline import merge_entities, EntityRuler