Spaces:
Sleeping
Sleeping
def split_sentence(sentence, max_text_length=180, delimiters=",;-!?"):
    """
    Recursively split a sentence into chunks shorter than ``max_text_length``.

    The sentence is cut in two at the delimiter closest to its middle; the
    delimiter character itself is dropped. If the sentence contains no
    delimiter, it is cut at the space closest to the middle so that words
    stay intact, and only a sentence without any space is cut at the exact
    midpoint. Each half is stripped of surrounding whitespace and split
    again with the same settings until it is short enough.

    Args:
        sentence (str): The input sentence to split.
        max_text_length (int): Sentences whose length is strictly below this
            value are returned unchanged (default: 180). A sentence of
            exactly this length is still split.
        delimiters (str): Characters to prioritize as split points
            (default: ",;-!?").

    Returns:
        list[str]: The chunks in their original order. Halves that become
        empty after stripping are omitted, so a long whitespace-only input
        yields an empty list. An input that is already short enough is
        returned as a one-element list, even if it is empty.
    """
    # A string of at most one character cannot be divided any further.
    # Returning it here also guarantees termination when max_text_length <= 1.
    if len(sentence) < max_text_length or len(sentence) <= 1:
        return [sentence]

    midpoint = len(sentence) // 2
    delimiter_indices = [i for i, char in enumerate(sentence) if char in delimiters]

    if delimiter_indices:
        # Cut at the delimiter nearest the middle and drop the delimiter.
        cut = min(delimiter_indices, key=lambda i: abs(i - midpoint))
        first_half = sentence[:cut].strip()
        second_half = sentence[cut + 1:].strip()
    else:
        # No delimiter: fall back to the word boundary nearest the middle.
        left_space = sentence.rfind(" ", 0, midpoint)
        right_space = sentence.find(" ", midpoint)
        spaces = [i for i in (left_space, right_space) if i != -1]
        # min() keeps the first of equally distant candidates, so the left
        # space wins ties; with no space at all, cut at the midpoint.
        cut = min(spaces, key=lambda i: abs(i - midpoint)) if spaces else midpoint
        first_half = sentence[:cut].strip()
        second_half = sentence[cut:].strip()

    chunks = []
    for half in (first_half, second_half):
        # Skip halves that are empty (e.g. the delimiter was the last
        # character) so no blank chunk reaches the caller.
        if half:
            chunks.extend(
                split_sentence(
                    half,
                    max_text_length=max_text_length,
                    # Forward the caller's delimiters so deeper levels split
                    # on the same characters as the first level.
                    delimiters=delimiters,
                )
            )
    return chunks
def merge_sentences(sentences):
    """
    Merge short sentences into their neighbours.

    Sentences are joined with a single space. Walking left to right, a
    sentence keeps absorbing the following ones until it has more than six
    whitespace-separated words or the input is exhausted. Afterwards, a final
    entry with fewer than six words is appended to the entry before it.

    Args:
        sentences: Sequence of sentence strings.

    Returns:
        list[str]: The merged sentences in their original order.
    """
    merged = []
    pending = None  # sentence currently being accumulated, if any

    for sentence in sentences:
        if pending is None:
            pending = sentence
        else:
            pending = pending + ' ' + sentence
        # Flush the accumulated text once it exceeds six words.
        if len(pending.split()) > 6:
            merged.append(pending)
            pending = None

    # Whatever is left over ran out of successors; keep it as the last entry.
    if pending is not None:
        merged.append(pending)

    # Fold a too-short trailing entry back into its predecessor.
    while len(merged) > 1 and len(merged[-1].split()) < 6:
        tail = merged.pop()
        merged[-1] += ' ' + tail

    return merged