Commit
·
53e37f7
1
Parent(s):
b448ba9
add models and config
Browse files- .DS_Store +0 -0
- config.json +6 -6
- configuration_lang.py → configuration_ocrqa.py +1 -1
- modeling_lang.py → modeling_ocrqa.py +1 -1
- models/ocrqa-wp_v1.0.5-de.bloom.log +31 -0
- models/ocrqa-wp_v1.0.5-en.bloom.log +28 -0
- models/ocrqa-wp_v1.0.5-fr.bloom.log +33 -0
- models/ocrqa-wp_v1.0.5-lb.bloom.log +45 -0
- models/ocrqa-wp_v1.0.6-de.bloom.log +246 -0
- models/ocrqa-wp_v1.0.6-fr.bloom.log +52 -0
- lang_detect.py → ocr_qa_assessment.py +1 -1
- test.py +0 -13
- test_floret.py +0 -10
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
config.json
CHANGED
@@ -1,21 +1,21 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "
|
3 |
"architectures": [
|
4 |
-
"
|
5 |
],
|
6 |
"filename": "LID-40-3-2000000-1-4.bin",
|
7 |
"attention_probs_dropout_prob": 0.1,
|
8 |
"auto_map": {
|
9 |
-
"AutoConfig": "
|
10 |
-
"AutoModelForTokenClassification": "
|
11 |
},
|
12 |
"custom_pipelines": {
|
13 |
"lang-detect": {
|
14 |
-
"impl": "
|
15 |
"pt": "AutoModelForTokenClassification"
|
16 |
}
|
17 |
},
|
18 |
-
"model_type": "
|
19 |
"torch_dtype": "float32",
|
20 |
"transformers_version": "4.49.0"
|
21 |
}
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "impresso-project/ocr-quality-assessment-light",
|
3 |
"architectures": [
|
4 |
+
"QAAssessmentModel"
|
5 |
],
|
6 |
"filename": "LID-40-3-2000000-1-4.bin",
|
7 |
"attention_probs_dropout_prob": 0.1,
|
8 |
"auto_map": {
|
9 |
+
"AutoConfig": "configuration_ocrqa.ImpressoConfig",
|
10 |
+
"AutoModelForTokenClassification": "modeling_ocrqa.QAAssessmentModel"
|
11 |
},
|
12 |
"custom_pipelines": {
|
13 |
"lang-detect": {
|
14 |
+
"impl": "ocr_qa_assessment.QAAssessmentPipeline",
|
15 |
"pt": "AutoModelForTokenClassification"
|
16 |
}
|
17 |
},
|
18 |
+
"model_type": "ocr_qa_assessment",
|
19 |
"torch_dtype": "float32",
|
20 |
"transformers_version": "4.49.0"
|
21 |
}
|
configuration_lang.py → configuration_ocrqa.py
RENAMED
@@ -3,7 +3,7 @@ import torch
|
|
3 |
|
4 |
|
5 |
class ImpressoConfig(PretrainedConfig):
|
6 |
-
model_type = "
|
7 |
|
8 |
def __init__(
|
9 |
self,
|
|
|
3 |
|
4 |
|
5 |
class ImpressoConfig(PretrainedConfig):
|
6 |
+
model_type = "ocr_qa_assessment"
|
7 |
|
8 |
def __init__(
|
9 |
self,
|
modeling_lang.py → modeling_ocrqa.py
RENAMED
@@ -10,7 +10,7 @@ from .configuration_lang import ImpressoConfig
|
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
12 |
|
13 |
-
class
|
14 |
config_class = ImpressoConfig
|
15 |
|
16 |
def __init__(self, config):
|
|
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
12 |
|
13 |
+
class QAAssessmentModel(PreTrainedModel):
|
14 |
config_class = ImpressoConfig
|
15 |
|
16 |
def __init__(self, config):
|
models/ocrqa-wp_v1.0.5-de.bloom.log
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:425 INFO: Namespace(input_files=['lex/de/realwords.rw.txt.bz2', 'lex/de/dewiki.unigram.freq.tsv.bz2', 'lex/de/nonwords.nw.txt.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-de.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-de.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True)
|
2 |
+
2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:226 INFO: Starting Bloom Filter creation...
|
3 |
+
2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/de/nonwords.nw.txt.bz2
|
4 |
+
2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['negierung']
|
5 |
+
2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:190 INFO: Excluded 1 words that should never be added
|
6 |
+
2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/de/realwords.rw.txt.bz2
|
7 |
+
2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:135 INFO: Processing frequency file: lex/de/dewiki.unigram.freq.tsv.bz2
|
8 |
+
2025-02-10 09:54:37,205 ocrqa_create_bloom_filter.py:240 INFO: low_freq_excluded before removing parts from high-frequency words: 3780824
|
9 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:248 INFO: low_freq_excluded after removing parts from high-frequency words: 3288865
|
10 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:252 INFO: Lexical processing complete.
|
11 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_read: 1
|
12 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_count: 1
|
13 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - realwords_read: 8
|
14 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - realwords_accepted: 8
|
15 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - realwords_nonwords_filtered: 0
|
16 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_read: 9199714
|
17 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - words_accepted: 4143119
|
18 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - subwords_accepted: 5029202
|
19 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - subwords_filtered: 1
|
20 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - low_freq_excluded: 3288865
|
21 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - single_char_words_filtered: 6719
|
22 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_filtered: 5049876
|
23 |
+
2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:259 INFO: Estimated word count: 3357976
|
24 |
+
2025-02-10 09:54:38,245 ocrqa_create_bloom_filter.py:263 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-de.bloom
|
25 |
+
2025-02-10 09:54:38,860 ocrqa_create_bloom_filter.py:285 INFO: Diagnosis Results:
|
26 |
+
2025-02-10 09:54:38,860 ocrqa_create_bloom_filter.py:286 INFO: - Excluded words in bloom filter: 0
|
27 |
+
2025-02-10 09:54:38,860 ocrqa_create_bloom_filter.py:287 INFO: - Known words not in bloom filter: 0
|
28 |
+
2025-02-10 09:54:39,386 ocrqa_create_bloom_filter.py:294 INFO: - Low-frequency words in bloom filter: 33
|
29 |
+
2025-02-10 09:54:39,386 ocrqa_create_bloom_filter.py:300 INFO: - Proportion of excluded words in bloom filter: 0.00000000
|
30 |
+
2025-02-10 09:54:39,386 ocrqa_create_bloom_filter.py:306 INFO: - Proportion of known words not in bloom filter: 0.00000000
|
31 |
+
2025-02-10 09:54:39,386 ocrqa_create_bloom_filter.py:314 INFO: - Proportion of low-frequency words in bloom filter: 0.00001003
|
models/ocrqa-wp_v1.0.5-en.bloom.log
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-03-01 01:26:13,631 ocrqa_create_bloom_filter.py:121 INFO: Git version: 5ad02a4
|
2 |
+
2025-03-01 01:26:13,631 ocrqa_create_bloom_filter.py:434 INFO: Namespace(input_files=['lex/en/modern_spelling.rw.txt', 'lex/en/enwiki.unigram.freq.tsv.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-en.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-en.bloom.log', config=None, min_frequency=5, single_char_min_frequency=20, diagnose_bloom=True, git_version='5ad02a4')
|
3 |
+
2025-03-01 01:26:13,631 ocrqa_create_bloom_filter.py:229 INFO: Starting Bloom Filter creation...
|
4 |
+
2025-03-01 01:26:13,631 ocrqa_create_bloom_filter.py:199 INFO: Processing real-word file: lex/en/modern_spelling.rw.txt
|
5 |
+
2025-03-01 01:26:13,711 ocrqa_create_bloom_filter.py:138 INFO: Processing frequency file: lex/en/enwiki.unigram.freq.tsv.bz2
|
6 |
+
2025-03-01 01:26:13,728 ocrqa_create_bloom_filter.py:163 WARNING: Skipping invalid line in frequency file lex/en/enwiki.unigram.freq.tsv.bz2: 107268
|
7 |
+
2025-03-01 01:26:24,792 ocrqa_create_bloom_filter.py:243 INFO: low_freq_excluded before removing parts from high-frequency words: 4575354
|
8 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:251 INFO: low_freq_excluded after removing parts from high-frequency words: 3991288
|
9 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:255 INFO: Lexical processing complete.
|
10 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - realwords_read: 129022
|
11 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - realwords_accepted: 129137
|
12 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - realwords_nonwords_filtered: 0
|
13 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - freq_words_read: 9710446
|
14 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - words_accepted: 2329192
|
15 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - subwords_accepted: 2836196
|
16 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - subwords_filtered: 0
|
17 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - low_freq_excluded: 3991288
|
18 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - invalid_lines_filtered: 1
|
19 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - single_char_words_filtered: 10825
|
20 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - freq_words_filtered: 7370428
|
21 |
+
2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:262 INFO: Estimated word count: 1597894
|
22 |
+
2025-03-01 01:26:25,315 ocrqa_create_bloom_filter.py:266 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-en.bloom
|
23 |
+
2025-03-01 01:26:25,588 ocrqa_create_bloom_filter.py:288 INFO: Diagnosis Results:
|
24 |
+
2025-03-01 01:26:25,588 ocrqa_create_bloom_filter.py:289 INFO: - Excluded words in bloom filter: 0
|
25 |
+
2025-03-01 01:26:25,588 ocrqa_create_bloom_filter.py:290 INFO: - Known words not in bloom filter: 0
|
26 |
+
2025-03-01 01:26:26,327 ocrqa_create_bloom_filter.py:297 INFO: - Low-frequency words in bloom filter: 40
|
27 |
+
2025-03-01 01:26:26,327 ocrqa_create_bloom_filter.py:309 INFO: - Proportion of known words not in bloom filter: 0.00000000
|
28 |
+
2025-03-01 01:26:26,327 ocrqa_create_bloom_filter.py:317 INFO: - Proportion of low-frequency words in bloom filter: 0.00001002
|
models/ocrqa-wp_v1.0.5-fr.bloom.log
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:425 INFO: Namespace(input_files=['lex/fr/realwords.rw.txt.bz2', 'lex/fr/morph.rw.txt.bz2', 'lex/fr/single_char.rw.txt.bz2', 'lex/fr/frwiki.unigram.freq.tsv.bz2', 'lex/fr/nonwords.nw.txt.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-fr.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-fr.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True)
|
2 |
+
2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:226 INFO: Starting Bloom Filter creation...
|
3 |
+
2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/fr/nonwords.nw.txt.bz2
|
4 |
+
2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gazelle']
|
5 |
+
2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:190 INFO: Excluded 1 words that should never be added
|
6 |
+
2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/realwords.rw.txt.bz2
|
7 |
+
2025-02-10 09:54:39,730 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/morph.rw.txt.bz2
|
8 |
+
2025-02-10 09:54:40,372 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/single_char.rw.txt.bz2
|
9 |
+
2025-02-10 09:54:40,373 ocrqa_create_bloom_filter.py:135 INFO: Processing frequency file: lex/fr/frwiki.unigram.freq.tsv.bz2
|
10 |
+
2025-02-10 09:54:45,072 ocrqa_create_bloom_filter.py:240 INFO: low_freq_excluded before removing parts from high-frequency words: 1552240
|
11 |
+
2025-02-10 09:54:45,172 ocrqa_create_bloom_filter.py:248 INFO: low_freq_excluded after removing parts from high-frequency words: 1228188
|
12 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:252 INFO: Lexical processing complete.
|
13 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_read: 1
|
14 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_count: 1
|
15 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - realwords_read: 723250
|
16 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - realwords_accepted: 736917
|
17 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - realwords_nonwords_filtered: 1
|
18 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_read: 4047539
|
19 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - words_accepted: 1992185
|
20 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - subwords_accepted: 2383447
|
21 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - subwords_filtered: 9
|
22 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - low_freq_excluded: 1228188
|
23 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - single_char_words_filtered: 6597
|
24 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_filtered: 2048757
|
25 |
+
2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:259 INFO: Estimated word count: 1939265
|
26 |
+
2025-02-10 09:54:45,657 ocrqa_create_bloom_filter.py:263 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-fr.bloom
|
27 |
+
2025-02-10 09:54:45,996 ocrqa_create_bloom_filter.py:285 INFO: Diagnosis Results:
|
28 |
+
2025-02-10 09:54:45,996 ocrqa_create_bloom_filter.py:286 INFO: - Excluded words in bloom filter: 0
|
29 |
+
2025-02-10 09:54:45,996 ocrqa_create_bloom_filter.py:287 INFO: - Known words not in bloom filter: 0
|
30 |
+
2025-02-10 09:54:46,146 ocrqa_create_bloom_filter.py:294 INFO: - Low-frequency words in bloom filter: 8
|
31 |
+
2025-02-10 09:54:46,146 ocrqa_create_bloom_filter.py:300 INFO: - Proportion of excluded words in bloom filter: 0.00000000
|
32 |
+
2025-02-10 09:54:46,146 ocrqa_create_bloom_filter.py:306 INFO: - Proportion of known words not in bloom filter: 0.00000000
|
33 |
+
2025-02-10 09:54:46,146 ocrqa_create_bloom_filter.py:314 INFO: - Proportion of low-frequency words in bloom filter: 0.00000651
|
models/ocrqa-wp_v1.0.5-lb.bloom.log
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:121 INFO: Git version: 50f305d
|
2 |
+
2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:434 INFO: Namespace(input_files=['lex/lb/lbwiki.unigram.freq.tsv.bz2', 'lex/lb/wortschatz.freq.tsv.bz2', 'lex/lb/modern_spelling.rw.txt', 'lex/lb/old_spelling.rw.txt', 'lex/lb/ocr_errors.nw.txt', 'lex/lb/realword_ocr_errors.nw.txt'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-lb.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-lb.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True, git_version='50f305d')
|
3 |
+
2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:229 INFO: Starting Bloom Filter creation...
|
4 |
+
2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:181 INFO: Processing nonword file: lex/lb/ocr_errors.nw.txt
|
5 |
+
2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['ail']
|
6 |
+
2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['dalt']
|
7 |
+
2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['erl']
|
8 |
+
2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['liât']
|
9 |
+
2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['liuet']
|
10 |
+
2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['secli']
|
11 |
+
2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['zc']
|
12 |
+
2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:193 INFO: Excluded 7 words that should never be added
|
13 |
+
2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:181 INFO: Processing nonword file: lex/lb/realword_ocr_errors.nw.txt
|
14 |
+
2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:193 INFO: Excluded 7 words that should never be added
|
15 |
+
2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:138 INFO: Processing frequency file: lex/lb/lbwiki.unigram.freq.tsv.bz2
|
16 |
+
2025-02-28 23:52:03,289 ocrqa_create_bloom_filter.py:163 WARNING: Skipping invalid line in frequency file lex/lb/lbwiki.unigram.freq.tsv.bz2: 446
|
17 |
+
2025-02-28 23:52:03,591 ocrqa_create_bloom_filter.py:138 INFO: Processing frequency file: lex/lb/wortschatz.freq.tsv.bz2
|
18 |
+
2025-02-28 23:52:03,814 ocrqa_create_bloom_filter.py:163 WARNING: Skipping invalid line in frequency file lex/lb/wortschatz.freq.tsv.bz2:
|
19 |
+
2025-02-28 23:52:03,814 ocrqa_create_bloom_filter.py:199 INFO: Processing real-word file: lex/lb/modern_spelling.rw.txt
|
20 |
+
2025-02-28 23:52:03,925 ocrqa_create_bloom_filter.py:199 INFO: Processing real-word file: lex/lb/old_spelling.rw.txt
|
21 |
+
2025-02-28 23:52:03,925 ocrqa_create_bloom_filter.py:243 INFO: low_freq_excluded before removing parts from high-frequency words: 156199
|
22 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:251 INFO: low_freq_excluded after removing parts from high-frequency words: 113360
|
23 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:255 INFO: Lexical processing complete.
|
24 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - nonwords_read: 7
|
25 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - nonwords_count: 7
|
26 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - freq_words_read: 487375
|
27 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - words_accepted: 314693
|
28 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - subwords_accepted: 366393
|
29 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - subwords_filtered: 0
|
30 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - low_freq_excluded: 113360
|
31 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - invalid_lines_filtered: 2
|
32 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - single_char_words_filtered: 334
|
33 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - freq_words_filtered: 172346
|
34 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - realwords_read: 152110
|
35 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - realwords_accepted: 154531
|
36 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - realwords_nonwords_filtered: 0
|
37 |
+
2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:262 INFO: Estimated word count: 285555
|
38 |
+
2025-02-28 23:52:04,014 ocrqa_create_bloom_filter.py:266 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-lb.bloom
|
39 |
+
2025-02-28 23:52:04,066 ocrqa_create_bloom_filter.py:288 INFO: Diagnosis Results:
|
40 |
+
2025-02-28 23:52:04,066 ocrqa_create_bloom_filter.py:289 INFO: - Excluded words in bloom filter: 0
|
41 |
+
2025-02-28 23:52:04,066 ocrqa_create_bloom_filter.py:290 INFO: - Known words not in bloom filter: 0
|
42 |
+
2025-02-28 23:52:04,079 ocrqa_create_bloom_filter.py:297 INFO: - Low-frequency words in bloom filter: 1
|
43 |
+
2025-02-28 23:52:04,079 ocrqa_create_bloom_filter.py:303 INFO: - Proportion of excluded words in bloom filter: 0.00000000
|
44 |
+
2025-02-28 23:52:04,079 ocrqa_create_bloom_filter.py:309 INFO: - Proportion of known words not in bloom filter: 0.00000000
|
45 |
+
2025-02-28 23:52:04,079 ocrqa_create_bloom_filter.py:317 INFO: - Proportion of low-frequency words in bloom filter: 0.00000882
|
models/ocrqa-wp_v1.0.6-de.bloom.log
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:425 INFO: Namespace(input_files=['lex/de/realword_ocr_errors.nw.txt', 'lex/de/ocr_errors.nw.txt', 'lex/de/old_spelling.rw.txt', 'lex/de/modern_spelling.rw.txt', 'lex/de/dewiki.unigram.freq.tsv.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-de.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-de.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True)
|
2 |
+
2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:226 INFO: Starting Bloom Filter creation...
|
3 |
+
2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/de/realword_ocr_errors.nw.txt
|
4 |
+
2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['negierung']
|
5 |
+
2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['negierungen']
|
6 |
+
2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nidwaiden']
|
7 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ölten']
|
8 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unterwaiden']
|
9 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verlausen']
|
10 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['vertretet']
|
11 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:190 INFO: Excluded 7 words that should never be added
|
12 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/de/ocr_errors.nw.txt
|
13 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['0oo']
|
14 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@d']
|
15 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@e']
|
16 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@i']
|
17 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@r']
|
18 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@t']
|
19 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['aargan']
|
20 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['abbin']
|
21 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['abgefetzt']
|
22 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['abgereift']
|
23 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ahresbesoldung']
|
24 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['aisbann']
|
25 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['aneh']
|
26 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['anmeldungstermln']
|
27 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ariesheim']
|
28 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['aueh']
|
29 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ausgefetzt']
|
30 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['auslände']
|
31 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bahmen']
|
32 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bandesblatt']
|
33 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bebacht']
|
34 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bechnung']
|
35 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['befetzt']
|
36 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['befetzten']
|
37 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['begelung']
|
38 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['begierung']
|
39 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['begierungen']
|
40 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['begierungsrat']
|
41 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['behorde']
|
42 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['behorden']
|
43 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['berieht']
|
44 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bersonen']
|
45 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['besicht']
|
46 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bestimm']
|
47 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['betragt']
|
48 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['betreifend']
|
49 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['beutscher']
|
50 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bevision']
|
51 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bielleicht']
|
52 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bingier']
|
53 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bnndesblatt']
|
54 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bücksicht']
|
55 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bundesbehorden']
|
56 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['diefe']
|
57 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['diefer']
|
58 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dingungen']
|
59 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dnrch']
|
60 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dnreh']
|
61 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dureh']
|
62 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eahmen']
|
63 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eappen']
|
64 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eäte']
|
65 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eatifikation']
|
66 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ebenfall']
|
67 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ebruar']
|
68 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eechnung']
|
69 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eecht']
|
70 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eechte']
|
71 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eechts']
|
72 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegel']
|
73 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegelung']
|
74 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegierung']
|
75 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegierungen']
|
76 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegierungsrat']
|
77 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eeglement']
|
78 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eeihe']
|
79 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eente']
|
80 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eenten']
|
81 |
+
2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eepublik']
|
82 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eesolution']
|
83 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eevision']
|
84 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ehur']
|
85 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eichter']
|
86 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eidgenossenschast']
|
87 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eidgenossische']
|
88 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eingefetzt']
|
89 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['einlabung']
|
90 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eirea']
|
91 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eiue']
|
92 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eldg']
|
93 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['elfaß']
|
94 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['endlieh']
|
95 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['engtischen']
|
96 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eobert']
|
97 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eolle']
|
98 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erbalten']
|
99 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erbbeben']
|
100 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erfetzt']
|
101 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erhallen']
|
102 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erlauft']
|
103 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erleiben']
|
104 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erleibet']
|
105 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erseht']
|
106 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eücksicht']
|
107 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['euenburg']
|
108 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['feiet']
|
109 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['feinet']
|
110 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['feite']
|
111 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['festgefetzt']
|
112 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fetze']
|
113 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fetzte']
|
114 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fetzten']
|
115 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fetzung']
|
116 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['feuersbrunft']
|
117 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fiir']
|
118 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fipoi']
|
119 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fllr']
|
120 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fönst']
|
121 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fortfetzen']
|
122 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fortfetzung']
|
123 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fortgefetzt']
|
124 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['franzofen']
|
125 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['franzosischen']
|
126 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['frauken']
|
127 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['galleu']
|
128 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gefetzt']
|
129 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gemass']
|
130 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gesellschast']
|
131 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gesellsehast']
|
132 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gewöhn']
|
133 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gierung']
|
134 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gischen']
|
135 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['grossere']
|
136 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['grossern']
|
137 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['hallung']
|
138 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['handelsund']
|
139 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['hauptfache']
|
140 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['heuligen']
|
141 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ì000']
|
142 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['iaht']
|
143 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['iahte']
|
144 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['iahten']
|
145 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['iiber']
|
146 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['infofern']
|
147 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['jnni']
|
148 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['kauton']
|
149 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['kautone']
|
150 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['kautons']
|
151 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['korden']
|
152 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['kreispostdirektiou']
|
153 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['leife']
|
154 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['liier']
|
155 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['lnzern']
|
156 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['locamo']
|
157 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['lostet']
|
158 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['luzernburg']
|
159 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['macbonalb']
|
160 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mahnahmen']
|
161 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mahregeln']
|
162 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mährend']
|
163 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['matznahmen']
|
164 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['melben']
|
165 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['melche']
|
166 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ministet']
|
167 |
+
2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mitleib']
|
168 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['moglich']
|
169 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['möglid']
|
170 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['moglieh']
|
171 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['naeh']
|
172 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nieht']
|
173 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nikiaus']
|
174 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['noeh']
|
175 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nothig']
|
176 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nothigen']
|
177 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ollem']
|
178 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['poft']
|
179 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['prankreich']
|
180 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['rebakteur']
|
181 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['reiburg']
|
182 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['reuenburg']
|
183 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['roieber']
|
184 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['rovember']
|
185 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ruffischen']
|
186 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['schisse']
|
187 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['schwierigleiten']
|
188 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['sehengen']
|
189 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['seihst']
|
190 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['siud']
|
191 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['srühern']
|
192 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['stanben']
|
193 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['stobt']
|
194 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['tatfache']
|
195 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['tatfachen']
|
196 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['teten']
|
197 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['thronrebe']
|
198 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['uater']
|
199 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['uicht']
|
200 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unbein']
|
201 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unfete']
|
202 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unier']
|
203 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unterschieb']
|
204 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['urfache']
|
205 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ürich']
|
206 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verfetzt']
|
207 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verkau']
|
208 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verkauten']
|
209 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verlauft']
|
210 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verlaus']
|
211 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verlehr']
|
212 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['vorfitz']
|
213 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['vstrr']
|
214 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['webet']
|
215 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['welehe']
|
216 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['wnrde']
|
217 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['znm']
|
218 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['zurlch']
|
219 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:190 INFO: Excluded 213 words that should never be added
|
220 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/de/old_spelling.rw.txt
|
221 |
+
2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/de/modern_spelling.rw.txt
|
222 |
+
2025-02-14 16:07:47,844 ocrqa_create_bloom_filter.py:135 INFO: Processing frequency file: lex/de/dewiki.unigram.freq.tsv.bz2
|
223 |
+
2025-02-14 16:08:00,551 ocrqa_create_bloom_filter.py:240 INFO: low_freq_excluded before removing parts from high-frequency words: 3780824
|
224 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:248 INFO: low_freq_excluded after removing parts from high-frequency words: 3288723
|
225 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:252 INFO: Lexical processing complete.
|
226 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_read: 213
|
227 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_count: 213
|
228 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - realwords_read: 488
|
229 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - realwords_accepted: 488
|
230 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - realwords_nonwords_filtered: 0
|
231 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_read: 9199714
|
232 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - words_accepted: 4143119
|
233 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - subwords_accepted: 5029202
|
234 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - subwords_filtered: 1
|
235 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - low_freq_excluded: 3288723
|
236 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - single_char_words_filtered: 6719
|
237 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_filtered: 5049876
|
238 |
+
2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:259 INFO: Estimated word count: 3358453
|
239 |
+
2025-02-14 16:08:01,663 ocrqa_create_bloom_filter.py:263 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-de.bloom
|
240 |
+
2025-02-14 16:08:02,322 ocrqa_create_bloom_filter.py:285 INFO: Diagnosis Results:
|
241 |
+
2025-02-14 16:08:02,323 ocrqa_create_bloom_filter.py:286 INFO: - Excluded words in bloom filter: 0
|
242 |
+
2025-02-14 16:08:02,323 ocrqa_create_bloom_filter.py:287 INFO: - Known words not in bloom filter: 0
|
243 |
+
2025-02-14 16:08:02,821 ocrqa_create_bloom_filter.py:294 INFO: - Low-frequency words in bloom filter: 25
|
244 |
+
2025-02-14 16:08:02,821 ocrqa_create_bloom_filter.py:300 INFO: - Proportion of excluded words in bloom filter: 0.00000000
|
245 |
+
2025-02-14 16:08:02,821 ocrqa_create_bloom_filter.py:306 INFO: - Proportion of known words not in bloom filter: 0.00000000
|
246 |
+
2025-02-14 16:08:02,821 ocrqa_create_bloom_filter.py:314 INFO: - Proportion of low-frequency words in bloom filter: 0.00000760
|
models/ocrqa-wp_v1.0.6-fr.bloom.log
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-02-14 16:08:03,171 ocrqa_create_bloom_filter.py:425 INFO: Namespace(input_files=['lex/fr/realword_ocr_errors.nw.txt', 'lex/fr/ocr_errors.nw.txt', 'lex/fr/old_spelling.rw.txt', 'lex/fr/modern_spelling.rw.txt', 'lex/fr/morph.rw.txt.bz2', 'lex/fr/single_char.rw.txt.bz2', 'lex/fr/frwiki.unigram.freq.tsv.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-fr.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-fr.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True)
|
2 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:226 INFO: Starting Bloom Filter creation...
|
3 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/fr/realword_ocr_errors.nw.txt
|
4 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gazelle']
|
5 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:190 INFO: Excluded 1 words that should never be added
|
6 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/fr/ocr_errors.nw.txt
|
7 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bàie']
|
8 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['chepin']
|
9 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dftce']
|
10 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ì']
|
11 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['locamo']
|
12 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mmmi']
|
13 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mmml']
|
14 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mmmv']
|
15 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['neuchat']
|
16 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['neuchàtel']
|
17 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ölten']
|
18 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ponr']
|
19 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['pribourg']
|
20 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['remercîments']
|
21 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['sadresser']
|
22 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['zuberbuhler']
|
23 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:190 INFO: Excluded 17 words that should never be added
|
24 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/old_spelling.rw.txt
|
25 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/modern_spelling.rw.txt
|
26 |
+
2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/morph.rw.txt.bz2
|
27 |
+
2025-02-14 16:08:03,856 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/single_char.rw.txt.bz2
|
28 |
+
2025-02-14 16:08:03,857 ocrqa_create_bloom_filter.py:135 INFO: Processing frequency file: lex/fr/frwiki.unigram.freq.tsv.bz2
|
29 |
+
2025-02-14 16:08:08,932 ocrqa_create_bloom_filter.py:240 INFO: low_freq_excluded before removing parts from high-frequency words: 1548627
|
30 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:248 INFO: low_freq_excluded after removing parts from high-frequency words: 1224019
|
31 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:252 INFO: Lexical processing complete.
|
32 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_read: 17
|
33 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_count: 17
|
34 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - realwords_read: 723486
|
35 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - realwords_accepted: 737153
|
36 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - realwords_nonwords_filtered: 1
|
37 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_read: 4047539
|
38 |
+
2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - words_accepted: 1992185
|
39 |
+
2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - subwords_accepted: 2385659
|
40 |
+
2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - subwords_filtered: 9
|
41 |
+
2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - low_freq_excluded: 1224019
|
42 |
+
2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - single_char_words_filtered: 6597
|
43 |
+
2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_filtered: 2048757
|
44 |
+
2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:259 INFO: Estimated word count: 1937683
|
45 |
+
2025-02-14 16:08:09,562 ocrqa_create_bloom_filter.py:263 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-fr.bloom
|
46 |
+
2025-02-14 16:08:09,924 ocrqa_create_bloom_filter.py:285 INFO: Diagnosis Results:
|
47 |
+
2025-02-14 16:08:09,924 ocrqa_create_bloom_filter.py:286 INFO: - Excluded words in bloom filter: 0
|
48 |
+
2025-02-14 16:08:09,924 ocrqa_create_bloom_filter.py:287 INFO: - Known words not in bloom filter: 0
|
49 |
+
2025-02-14 16:08:10,101 ocrqa_create_bloom_filter.py:294 INFO: - Low-frequency words in bloom filter: 15
|
50 |
+
2025-02-14 16:08:10,101 ocrqa_create_bloom_filter.py:300 INFO: - Proportion of excluded words in bloom filter: 0.00000000
|
51 |
+
2025-02-14 16:08:10,101 ocrqa_create_bloom_filter.py:306 INFO: - Proportion of known words not in bloom filter: 0.00000000
|
52 |
+
2025-02-14 16:08:10,101 ocrqa_create_bloom_filter.py:314 INFO: - Proportion of low-frequency words in bloom filter: 0.00001225
|
lang_detect.py → ocr_qa_assessment.py
RENAMED
@@ -1,7 +1,7 @@
|
|
1 |
from transformers import Pipeline
|
2 |
|
3 |
|
4 |
-
class
|
5 |
|
6 |
def _sanitize_parameters(self, **kwargs):
|
7 |
preprocess_kwargs = {}
|
|
|
1 |
from transformers import Pipeline
|
2 |
|
3 |
|
4 |
+
class QAAssessmentPipeline(Pipeline):
|
5 |
|
6 |
def _sanitize_parameters(self, **kwargs):
|
7 |
preprocess_kwargs = {}
|
test.py
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
from transformers import AutoModelForTokenClassification, AutoTokenizer
|
2 |
-
from transformers import pipeline
|
3 |
-
|
4 |
-
MODEL_NAME = "emanuelaboros/lang-detect"
|
5 |
-
|
6 |
-
lang_pipeline = pipeline(
|
7 |
-
"lang-detect", model=MODEL_NAME, trust_remote_code=True, device="cpu"
|
8 |
-
)
|
9 |
-
|
10 |
-
sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
|
11 |
-
|
12 |
-
langs = lang_pipeline(sentence)
|
13 |
-
langs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_floret.py
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
import floret
|
2 |
-
|
3 |
-
model_floret = floret.load_model("LID-40-3-2000000-1-4.bin")
|
4 |
-
print(type(model_floret))
|
5 |
-
input_ids = 'this is a text'
|
6 |
-
|
7 |
-
input_ids = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
|
8 |
-
|
9 |
-
print(model_floret.predict([input_ids], k=1))
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|