File size: 3,130 Bytes
2e99c77
0210dff
2e99c77
 
 
 
 
 
 
 
 
 
 
0210dff
 
 
2e99c77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0210dff
 
2e99c77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

def split_sentence(sentence, max_text_length=180, delimiters=",;-!?"):
    """
    Recursively split a sentence into chunks shorter than ``max_text_length``.

    A sentence whose length is already below ``max_text_length`` is returned
    unchanged as a single-element list. Otherwise it is cut in two at the
    delimiter closest to its midpoint (the delimiter character itself is
    dropped); when no delimiter is present it is cut at the space closest to
    the midpoint, and when there is no space either it is cut at the midpoint.
    Both halves are stripped of surrounding whitespace and split again with
    the same settings.

    Args:
        sentence (str): The input sentence to split.
        max_text_length (int): Chunks are split until their length is
            strictly below this value (default: 180).
        delimiters (str): Characters to prioritize as split points
            (default: ",;-!?").

    Returns:
        list: The chunks in their original order. Halves that are empty after
        stripping are omitted, so the list can be empty when a too-long input
        consists only of whitespace and delimiters.
    """
    # A string of length <= 1 cannot be split any further; stopping here also
    # guarantees termination when max_text_length is 0 or 1.
    if len(sentence) < max_text_length or len(sentence) <= 1:
        return [sentence]

    midpoint = len(sentence) // 2

    # Find all delimiter indices in the sentence
    delimiter_indices = [i for i, char in enumerate(sentence) if char in delimiters]

    if delimiter_indices:
        # Split at the delimiter closest to the midpoint (leftmost on a tie)
        closest_delimiter = min(delimiter_indices, key=lambda x: abs(x - midpoint))
        first_half = sentence[:closest_delimiter].strip()
        second_half = sentence[closest_delimiter + 1:].strip()
    else:
        # No delimiter: look for the nearest space on either side of the midpoint
        left_space = sentence.rfind(" ", 0, midpoint)
        right_space = sentence.find(" ", midpoint)

        if left_space == -1 and right_space == -1:
            # No spaces found (single word), split at midpoint
            split_index = midpoint
        elif left_space == -1:
            # Only right space found
            split_index = right_space
        elif right_space == -1:
            # Only left space found
            split_index = left_space
        elif (midpoint - left_space) <= (right_space - midpoint):
            # Both found: prefer the left space when it is at least as close
            split_index = left_space
        else:
            split_index = right_space

        first_half = sentence[:split_index].strip()
        second_half = sentence[split_index:].strip()

    chunks = []
    for half in (first_half, second_half):
        # Skip halves that vanished after stripping (e.g. delimiter at an edge)
        if half:
            # Forward the caller's delimiters so nested splits honor them too
            chunks.extend(
                split_sentence(half, max_text_length=max_text_length, delimiters=delimiters)
            )
    return chunks


def merge_sentences(sentences):
    """ glue short sentences onto their neighbours so no tiny fragments remain

    Forward pass: consecutive sentences are joined with a single space until
    the running text has more than 6 words (or the input runs out).
    Backward pass: a trailing entry with fewer than 6 words is appended to the
    entry before it. Note the two passes use different thresholds (<= 6 vs < 6).

    Returns a new list of strings; an empty input gives an empty list.
    """
    merged = []
    pending = None  # text accumulated so far that is not yet long enough
    for sentence in sentences:
        if pending is None:
            pending = sentence
        else:
            pending = pending + ' ' + sentence
        # flush as soon as the accumulated text exceeds 6 words
        if len(pending.split()) > 6:
            merged.append(pending)
            pending = None
    # whatever is left over at the end becomes the last entry, however short
    if pending is not None:
        merged.append(pending)

    # fold a too-short final entry back into the one preceding it
    while len(merged) > 1 and len(merged[-1].split()) < 6:
        tail = merged.pop()
        merged[-1] += ' ' + tail
    return merged