Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-11 20:28:20 +03:00), commit 6942a6a69b
Most of these characters are for languages / writing systems that aren't supported by spaCy, but I don't think it causes problems to include them. In the UD evals, Hindi and Urdu improve a lot as expected (from 0-10% to 70-80%) and Persian improves a little (90% to 96%). Tamil improves in combination with #4288.

The punctuation list is converted to a set internally because of its increased length.

Sentence-final punctuation was generated with:

```
unichars -gas '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' '\p{Terminal_Punctuation}'
```

See: https://stackoverflow.com/a/9508766/461847

Fixes #4269.
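As a quick illustration of what the tests below exercise, here is a minimal sketch (not part of the commit) that segments a Doc on the Devanagari danda. Whether the danda is included in the Sentencizer's default punct_chars depends on the spaCy version, so it is passed explicitly here; the bare Vocab() and the toy tokens are stand-ins for illustration only.

```
from spacy.vocab import Vocab
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc

# Toy doc: the danda (।) and "?" end sentences; the other tokens are placeholders.
vocab = Vocab()
doc = Doc(vocab, words=["A", "।", "B", "?"])

# Pass the danda explicitly rather than relying on the default punct_chars.
sentencizer = Sentencizer(punct_chars=["।", "?"])
doc = sentencizer(doc)

assert [t.is_sent_start for t in doc] == [True, False, True, False]
assert len(list(doc.sents)) == 2
```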
88 lines · 2.9 KiB · Python
# coding: utf8
from __future__ import unicode_literals

import pytest
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc


def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    sent_starts = [t.is_sent_start for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert len(list(doc.sents)) == 2

@pytest.mark.parametrize(
    "words,sent_starts,n_sents",
    [
        # The expected result here is that the duplicate punctuation gets merged
        # onto the same sentence and no one-token sentence is created for them.
        (
            ["Hello", "!", ".", "Test", ".", ".", "ok"],
            [True, False, False, True, False, False, True],
            3,
        ),
        # We also want to make sure ¡ and ¿ aren't treated as sentence end
        # markers, even though they're punctuation
        (
            ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"],
            [True, False, False, False, True, False, False, False, False, False],
            2,
        ),
        # The Token.is_punct check ensures that quotes are handled as well
        (
            ['"', "Nice", "!", '"', "I", "am", "happy", "."],
            [True, False, False, False, True, False, False, False],
            2,
        ),
    ],
)
def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents

@pytest.mark.parametrize(
    "punct_chars,words,sent_starts,n_sents",
    [
        (
            ["~", "?"],
            ["Hello", "world", "~", "A", ".", "B", "."],
            [True, False, False, True, False, False, False],
            2,
        ),
        # Even though it's not common, the punct_chars should be able to
        # handle any tokens
        (
            [".", "ö"],
            ["Hello", ".", "Test", "ö", "Ok", "."],
            [True, False, True, False, True, False],
            3,
        ),
    ],
)
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=punct_chars)
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents

def test_sentencizer_serialize_bytes(en_vocab):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    assert sentencizer.punct_chars == set(punct_chars)
    bytes_data = sentencizer.to_bytes()
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
    assert new_sentencizer.punct_chars == set(punct_chars)