mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
Fix lint issues in training/. Ignore trailing whitespace and overly long lines.
This commit is contained in:
parent
50dac51dc8
commit
9de2257e7a
2
.github/workflows/tests.yml
vendored
2
.github/workflows/tests.yml
vendored
|
@ -47,7 +47,7 @@ jobs:
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||||
- name: cython-lint
|
- name: cython-lint
|
||||||
run: |
|
run: |
|
||||||
python -m pip install cython-lint -c requirements.txt
|
python -m pip install cython-lint -c requirements.txt --ignore E501,W291
|
||||||
cython-lint spacy
|
cython-lint spacy
|
||||||
|
|
||||||
tests:
|
tests:
|
||||||
|
|
|
@ -37,10 +37,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
|
||||||
b2a.append(set())
|
b2a.append(set())
|
||||||
# Process the alignment at the current position
|
# Process the alignment at the current position
|
||||||
if A[token_idx_a] == B[token_idx_b] and \
|
if A[token_idx_a] == B[token_idx_b] and \
|
||||||
(char_idx_a == 0 or \
|
(
|
||||||
char_to_token_a[char_idx_a - 1] < token_idx_a) and \
|
char_idx_a == 0 or
|
||||||
(char_idx_b == 0 or \
|
char_to_token_a[char_idx_a - 1] < token_idx_a
|
||||||
char_to_token_b[char_idx_b - 1] < token_idx_b):
|
) and \
|
||||||
|
(
|
||||||
|
char_idx_b == 0 or
|
||||||
|
char_to_token_b[char_idx_b - 1] < token_idx_b
|
||||||
|
):
|
||||||
# Current tokens are identical and both character offsets are the
|
# Current tokens are identical and both character offsets are the
|
||||||
# start of a token (either at the beginning of the document or the
|
# start of a token (either at the beginning of the document or the
|
||||||
# previous character belongs to a different token)
|
# previous character belongs to a different token)
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import warnings
|
|
||||||
from collections.abc import Iterable as IterableInstance
|
from collections.abc import Iterable as IterableInstance
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -31,9 +30,9 @@ cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
||||||
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
|
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
|
||||||
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
||||||
if "entities" in doc_annot:
|
if "entities" in doc_annot:
|
||||||
_add_entities_to_doc(output, doc_annot["entities"])
|
_add_entities_to_doc(output, doc_annot["entities"])
|
||||||
if "spans" in doc_annot:
|
if "spans" in doc_annot:
|
||||||
_add_spans_to_doc(output, doc_annot["spans"])
|
_add_spans_to_doc(output, doc_annot["spans"])
|
||||||
if array.size:
|
if array.size:
|
||||||
output = output.from_array(attrs, array)
|
output = output.from_array(attrs, array)
|
||||||
# links are currently added with ENT_KB_ID on the token level
|
# links are currently added with ENT_KB_ID on the token level
|
||||||
|
@ -161,7 +160,6 @@ cdef class Example:
|
||||||
self._y_sig = y_sig
|
self._y_sig = y_sig
|
||||||
return self._cached_alignment
|
return self._cached_alignment
|
||||||
|
|
||||||
|
|
||||||
def _get_aligned_vectorized(self, align, gold_values):
|
def _get_aligned_vectorized(self, align, gold_values):
|
||||||
# Fast path for Doc attributes/fields that are predominantly a single value,
|
# Fast path for Doc attributes/fields that are predominantly a single value,
|
||||||
# i.e., TAG, POS, MORPH.
|
# i.e., TAG, POS, MORPH.
|
||||||
|
@ -204,7 +202,6 @@ cdef class Example:
|
||||||
|
|
||||||
return output.tolist()
|
return output.tolist()
|
||||||
|
|
||||||
|
|
||||||
def _get_aligned_non_vectorized(self, align, gold_values):
|
def _get_aligned_non_vectorized(self, align, gold_values):
|
||||||
# Slower path for fields that return multiple values (resulting
|
# Slower path for fields that return multiple values (resulting
|
||||||
# in ragged arrays that cannot be vectorized trivially).
|
# in ragged arrays that cannot be vectorized trivially).
|
||||||
|
@ -221,7 +218,6 @@ cdef class Example:
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
def get_aligned(self, field, as_string=False):
|
def get_aligned(self, field, as_string=False):
|
||||||
"""Return an aligned array for a token attribute."""
|
"""Return an aligned array for a token attribute."""
|
||||||
align = self.alignment.x2y
|
align = self.alignment.x2y
|
||||||
|
@ -330,7 +326,7 @@ cdef class Example:
|
||||||
missing=None
|
missing=None
|
||||||
)
|
)
|
||||||
# Now fill the tokens we can align to O.
|
# Now fill the tokens we can align to O.
|
||||||
O = 2 # I=1, O=2, B=3
|
O = 2 # I=1, O=2, B=3 # no-cython-lint: E741
|
||||||
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
|
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
|
||||||
if x_tags[i] is None:
|
if x_tags[i] is None:
|
||||||
if ent_iob == O:
|
if ent_iob == O:
|
||||||
|
@ -340,7 +336,7 @@ cdef class Example:
|
||||||
return x_ents, x_tags
|
return x_ents, x_tags
|
||||||
|
|
||||||
def get_aligned_ner(self):
|
def get_aligned_ner(self):
|
||||||
x_ents, x_tags = self.get_aligned_ents_and_ner()
|
_x_ents, x_tags = self.get_aligned_ents_and_ner()
|
||||||
return x_tags
|
return x_tags
|
||||||
|
|
||||||
def get_matching_ents(self, check_label=True):
|
def get_matching_ents(self, check_label=True):
|
||||||
|
@ -398,7 +394,6 @@ cdef class Example:
|
||||||
|
|
||||||
return span_dict
|
return span_dict
|
||||||
|
|
||||||
|
|
||||||
def _links_to_dict(self):
|
def _links_to_dict(self):
|
||||||
links = {}
|
links = {}
|
||||||
for ent in self.reference.ents:
|
for ent in self.reference.ents:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user