From afb22ad491cfb2486393cf319bb182b573d6e35c Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 6 Feb 2024 14:14:55 +0100
Subject: [PATCH] Remove debug data normalization for span analysis (#13203)

* Remove debug data normalization for span analysis

As a result of this normalization, `debug data` could show a user tokens that do not exist in their data.

* Update spacy/cli/debug_data.py

---------

Co-authored-by: svlandeg
---
 spacy/cli/debug_data.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 714969be1..7a98e6d56 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1073,8 +1073,7 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
     word_counts: Counter = Counter()
     for doc in docs:
         for token in doc:
-            # Normalize the text
-            t = token.text.lower().replace("``", '"').replace("''", '"')
+            t = token.text.lower()
             word_counts[t] += 1
     if normalize:
         total = sum(word_counts.values(), 0.0)