mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Remove unicode declarations and update language data
This commit is contained in:
parent
ba600f91c5
commit
df0b68f60e
|
@ -1,7 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
>>> from spacy.lang.cs.examples import sentences
|
>>> from spacy.lang.cs.examples import sentences
|
||||||
|
@ -10,9 +6,9 @@ Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"Máma mele maso.",
|
"Máma mele maso.",
|
||||||
"Příliš žluťoučký kůň úpěl ďábelské ódy.",
|
"Příliš žluťoučký kůň úpěl ďábelské ódy.",
|
||||||
"ArcGIS je geografický informační systém určený pro práci s prostorovými daty." ,
|
"ArcGIS je geografický informační systém určený pro práci s prostorovými daty.",
|
||||||
"Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat.",
|
"Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat.",
|
||||||
"Dnes je krásné počasí.",
|
"Dnes je krásné počasí.",
|
||||||
"Nestihl autobus, protože pozdě vstal z postele.",
|
"Nestihl autobus, protože pozdě vstal z postele.",
|
||||||
|
@ -39,4 +35,4 @@ sentences = [
|
||||||
"Jaké PSČ má Praha 1?",
|
"Jaké PSČ má Praha 1?",
|
||||||
"PSČ Prahy 1 je 110 00.",
|
"PSČ Prahy 1 je 110 00.",
|
||||||
"Za 20 minut jede vlak.",
|
"Za 20 minut jede vlak.",
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
|
@ -43,7 +40,7 @@ _num_words = [
|
||||||
"kvadrilion",
|
"kvadrilion",
|
||||||
"kvadriliarda",
|
"kvadriliarda",
|
||||||
"kvintilion",
|
"kvintilion",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
|
@ -73,6 +70,7 @@ _ordinal_words = [
|
||||||
"עשירי",
|
"עשירי",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
|
@ -84,7 +82,7 @@ def like_num(text):
|
||||||
num, denom = text.split("/")
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt
|
# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,18 +1,10 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
|
||||||
|
|
||||||
|
|
||||||
class SanskritDefaults(Language.Defaults):
|
class SanskritDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = LEX_ATTRS
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "sa"
|
|
||||||
|
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
# reference 1: https://en.wikibooks.org/wiki/Sanskrit/Numbers
|
# reference 1: https://en.wikibooks.org/wiki/Sanskrit/Numbers
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
|
@ -106,26 +102,26 @@ _num_words = [
|
||||||
"सप्तनवतिः",
|
"सप्तनवतिः",
|
||||||
"अष्टनवतिः",
|
"अष्टनवतिः",
|
||||||
"एकोनशतम्",
|
"एकोनशतम्",
|
||||||
"शतम्"
|
"शतम्",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
"""
|
"""
|
||||||
Check if text resembles a number
|
Check if text resembles a number
|
||||||
"""
|
"""
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(",", "").replace(".", "")
|
text = text.replace(",", "").replace(".", "")
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count("/") == 1:
|
if text.count("/") == 1:
|
||||||
num, denom = text.split("/")
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
# Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323
|
# Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,15 +1,13 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Span
|
from spacy.tokens import Span
|
||||||
from spacy import displacy
|
from spacy import displacy
|
||||||
|
|
||||||
SAMPLE_TEXT = '''First line
|
|
||||||
|
SAMPLE_TEXT = """First line
|
||||||
Second line, with ent
|
Second line, with ent
|
||||||
Third line
|
Third line
|
||||||
Fourth line
|
Fourth line
|
||||||
'''
|
"""
|
||||||
|
|
||||||
|
|
||||||
def test_issue5838():
|
def test_issue5838():
|
||||||
|
@ -18,8 +16,8 @@ def test_issue5838():
|
||||||
|
|
||||||
nlp = English()
|
nlp = English()
|
||||||
doc = nlp(SAMPLE_TEXT)
|
doc = nlp(SAMPLE_TEXT)
|
||||||
doc.ents = [Span(doc, 7, 8, label='test')]
|
doc.ents = [Span(doc, 7, 8, label="test")]
|
||||||
|
|
||||||
html = displacy.render(doc, style='ent')
|
html = displacy.render(doc, style="ent")
|
||||||
found = html.count('</br>')
|
found = html.count("</br>")
|
||||||
assert found == 4
|
assert found == 4
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.pipeline import merge_entities, EntityRuler
|
from spacy.pipeline import merge_entities, EntityRuler
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user