# Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
# 66 lines, 1.9 KiB, Python
import re
 | 
						|
import os
 | 
						|
from os import path
 | 
						|
 | 
						|
 | 
						|
def parse(sent_text, strip_bad_periods=False):
    """Read one bracketed parse tree into its tokens and labelled spans.

    The outermost bracket is discarded before scanning. Every leaf written
    as ``(LABEL text)`` contributes ``text`` to the word list, unless its
    label is ``-NONE-``. Every bracket that closes without attached text,
    and every ``-NONE-`` leaf, is recorded as a span instead.

    Args:
        sent_text: The bracketed tree for a single sentence. After
            stripping surrounding whitespace it must start with ``(``.
        strip_bad_periods: When true, a ``.`` leaf that directly follows a
            word already ending in ``.`` is dropped (see
            ``_is_bad_period``); neither a word nor a span is recorded
            for it.

    Returns:
        A ``(words, brackets)`` tuple. ``words`` is the list of token
        strings in order. ``brackets`` is a list of
        ``(label, start, end)`` tuples, where ``start`` is the number of
        words seen when the bracket opened and ``end`` the number seen
        when it closed (so ``-NONE-`` leaves have ``start == end``).
        Spans appear in the order their brackets close.

    Raises:
        AssertionError: If the text is empty or does not start with ``(``.
        IndexError: If a closing bracket has no matching opening bracket.
    """
    sent_text = sent_text.strip()
    assert sent_text and sent_text.startswith('(')
    open_brackets = []
    brackets = []
    # Alternative 1 matches "(LABEL"; alternative 2 matches an optional
    # token immediately followed by ")".
    bracket_re = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')
    word_i = 0
    words = []
    # Remove outermost bracket. The slice below drops the first two
    # characters and the last one, so "((" is first normalised to "( (" to
    # avoid cutting into the first labelled bracket.
    if sent_text.startswith('(('):
        sent_text = sent_text.replace('((', '( (', 1)
    for match in bracket_re.finditer(sent_text[2:-1]):
        open_, label, text, close = match.groups()
        if open_:
            assert not close
            assert label.strip()
            # Remember where this bracket starts, measured in words.
            open_brackets.append((label, word_i))
        else:
            assert close
            label, start = open_brackets.pop()
            assert label.strip()
            if strip_bad_periods and words and _is_bad_period(words[-1], text):
                continue
            # Traces leave 0-width bracket, but no token
            if text and label != '-NONE-':
                words.append(text)
                word_i += 1
            else:
                brackets.append((label, start, word_i))
    return words, brackets
 | 
						|
 | 
						|
 | 
						|
def _is_bad_period(prev, period):
    """Tell whether ``period`` is a '.' token following a word ending in '.'.

    Args:
        prev: The previously accepted word.
        period: The text of the current leaf; anything other than the
            exact string ``'.'`` is never considered bad.

    Returns:
        True only when ``period`` is ``'.'`` and ``prev`` ends with ``'.'``
        without being a bare ``'.'`` itself; False otherwise.
    """
    if period != '.':
        return False
    # A bare '.' before another '.' is kept; only a longer word that
    # already ends in '.' makes the following '.' bad.
    return prev != '.' and prev.endswith('.')
 | 
						|
 | 
						|
 | 
						|
def split(text):
    """Split a block of bracketed trees into one string per sentence.

    Args:
        text: Text in which each sentence begins on a line whose first
            character is ``(``; continuation lines start with anything
            else (typically indentation).

    Returns:
        A list of strings, one per sentence. Each is that sentence's
        non-blank lines joined with newlines, with trailing whitespace
        removed from every line. Empty or whitespace-only input yields
        an empty list.
    """
    sentences = []
    current = []

    for line in text.strip().split('\n'):
        line = line.rstrip()
        if not line:
            continue
        # Detect the start of sentences by line starting with (
        # This is messy, but it keeps bracket parsing at the sentence level
        # The `current` check prevents flushing an empty sentence when the
        # very first line opens a tree.
        if line.startswith('(') and current:
            sentences.append('\n'.join(current))
            current = []
        current.append(line)
    if current:
        sentences.append('\n'.join(current))
    return sentences
 |