emanuelaboros committed on
Commit
53e37f7
·
1 Parent(s): b448ba9

add models and config

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
config.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
- "_name_or_path": "emanuelaboros/lang-detect",
3
  "architectures": [
4
- "LangDetectorModel"
5
  ],
6
  "filename": "LID-40-3-2000000-1-4.bin",
7
  "attention_probs_dropout_prob": 0.1,
8
  "auto_map": {
9
- "AutoConfig": "configuration_lang.ImpressoConfig",
10
- "AutoModelForTokenClassification": "modeling_lang.LangDetectorModel"
11
  },
12
  "custom_pipelines": {
13
  "lang-detect": {
14
- "impl": "lang_detect.LangDetectionPipeline",
15
  "pt": "AutoModelForTokenClassification"
16
  }
17
  },
18
- "model_type": "lang_detect",
19
  "torch_dtype": "float32",
20
  "transformers_version": "4.49.0"
21
  }
 
1
  {
2
+ "_name_or_path": "impresso-project/ocr-quality-assessment-light",
3
  "architectures": [
4
+ "QAAssessmentModel"
5
  ],
6
  "filename": "LID-40-3-2000000-1-4.bin",
7
  "attention_probs_dropout_prob": 0.1,
8
  "auto_map": {
9
+ "AutoConfig": "configuration_ocrqa.ImpressoConfig",
10
+ "AutoModelForTokenClassification": "modeling_ocrqa.QAAssessmentModel"
11
  },
12
  "custom_pipelines": {
13
  "lang-detect": {
14
+ "impl": "ocr_qa_assessment.QAAssessmentPipeline",
15
  "pt": "AutoModelForTokenClassification"
16
  }
17
  },
18
+ "model_type": "ocr_qa_assessment",
19
  "torch_dtype": "float32",
20
  "transformers_version": "4.49.0"
21
  }
configuration_lang.py → configuration_ocrqa.py RENAMED
@@ -3,7 +3,7 @@ import torch
3
 
4
 
5
  class ImpressoConfig(PretrainedConfig):
6
- model_type = "lang_detect"
7
 
8
  def __init__(
9
  self,
 
3
 
4
 
5
  class ImpressoConfig(PretrainedConfig):
6
+ model_type = "ocr_qa_assessment"
7
 
8
  def __init__(
9
  self,
modeling_lang.py → modeling_ocrqa.py RENAMED
@@ -10,7 +10,7 @@ from .configuration_lang import ImpressoConfig
10
  logger = logging.getLogger(__name__)
11
 
12
 
13
- class LangDetectorModel(PreTrainedModel):
14
  config_class = ImpressoConfig
15
 
16
  def __init__(self, config):
 
10
  logger = logging.getLogger(__name__)
11
 
12
 
13
+ class QAAssessmentModel(PreTrainedModel):
14
  config_class = ImpressoConfig
15
 
16
  def __init__(self, config):
models/ocrqa-wp_v1.0.5-de.bloom.log ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:425 INFO: Namespace(input_files=['lex/de/realwords.rw.txt.bz2', 'lex/de/dewiki.unigram.freq.tsv.bz2', 'lex/de/nonwords.nw.txt.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-de.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-de.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True)
2
+ 2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:226 INFO: Starting Bloom Filter creation...
3
+ 2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/de/nonwords.nw.txt.bz2
4
+ 2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['negierung']
5
+ 2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:190 INFO: Excluded 1 words that should never be added
6
+ 2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/de/realwords.rw.txt.bz2
7
+ 2025-02-10 09:54:25,418 ocrqa_create_bloom_filter.py:135 INFO: Processing frequency file: lex/de/dewiki.unigram.freq.tsv.bz2
8
+ 2025-02-10 09:54:37,205 ocrqa_create_bloom_filter.py:240 INFO: low_freq_excluded before removing parts from high-frequency words: 3780824
9
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:248 INFO: low_freq_excluded after removing parts from high-frequency words: 3288865
10
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:252 INFO: Lexical processing complete.
11
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_read: 1
12
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_count: 1
13
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - realwords_read: 8
14
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - realwords_accepted: 8
15
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - realwords_nonwords_filtered: 0
16
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_read: 9199714
17
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - words_accepted: 4143119
18
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - subwords_accepted: 5029202
19
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - subwords_filtered: 1
20
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - low_freq_excluded: 3288865
21
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - single_char_words_filtered: 6719
22
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_filtered: 5049876
23
+ 2025-02-10 09:54:37,366 ocrqa_create_bloom_filter.py:259 INFO: Estimated word count: 3357976
24
+ 2025-02-10 09:54:38,245 ocrqa_create_bloom_filter.py:263 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-de.bloom
25
+ 2025-02-10 09:54:38,860 ocrqa_create_bloom_filter.py:285 INFO: Diagnosis Results:
26
+ 2025-02-10 09:54:38,860 ocrqa_create_bloom_filter.py:286 INFO: - Excluded words in bloom filter: 0
27
+ 2025-02-10 09:54:38,860 ocrqa_create_bloom_filter.py:287 INFO: - Known words not in bloom filter: 0
28
+ 2025-02-10 09:54:39,386 ocrqa_create_bloom_filter.py:294 INFO: - Low-frequency words in bloom filter: 33
29
+ 2025-02-10 09:54:39,386 ocrqa_create_bloom_filter.py:300 INFO: - Proportion of excluded words in bloom filter: 0.00000000
30
+ 2025-02-10 09:54:39,386 ocrqa_create_bloom_filter.py:306 INFO: - Proportion of known words not in bloom filter: 0.00000000
31
+ 2025-02-10 09:54:39,386 ocrqa_create_bloom_filter.py:314 INFO: - Proportion of low-frequency words in bloom filter: 0.00001003
models/ocrqa-wp_v1.0.5-en.bloom.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-01 01:26:13,631 ocrqa_create_bloom_filter.py:121 INFO: Git version: 5ad02a4
2
+ 2025-03-01 01:26:13,631 ocrqa_create_bloom_filter.py:434 INFO: Namespace(input_files=['lex/en/modern_spelling.rw.txt', 'lex/en/enwiki.unigram.freq.tsv.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-en.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-en.bloom.log', config=None, min_frequency=5, single_char_min_frequency=20, diagnose_bloom=True, git_version='5ad02a4')
3
+ 2025-03-01 01:26:13,631 ocrqa_create_bloom_filter.py:229 INFO: Starting Bloom Filter creation...
4
+ 2025-03-01 01:26:13,631 ocrqa_create_bloom_filter.py:199 INFO: Processing real-word file: lex/en/modern_spelling.rw.txt
5
+ 2025-03-01 01:26:13,711 ocrqa_create_bloom_filter.py:138 INFO: Processing frequency file: lex/en/enwiki.unigram.freq.tsv.bz2
6
+ 2025-03-01 01:26:13,728 ocrqa_create_bloom_filter.py:163 WARNING: Skipping invalid line in frequency file lex/en/enwiki.unigram.freq.tsv.bz2: 107268
7
+ 2025-03-01 01:26:24,792 ocrqa_create_bloom_filter.py:243 INFO: low_freq_excluded before removing parts from high-frequency words: 4575354
8
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:251 INFO: low_freq_excluded after removing parts from high-frequency words: 3991288
9
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:255 INFO: Lexical processing complete.
10
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - realwords_read: 129022
11
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - realwords_accepted: 129137
12
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - realwords_nonwords_filtered: 0
13
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - freq_words_read: 9710446
14
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - words_accepted: 2329192
15
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - subwords_accepted: 2836196
16
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - subwords_filtered: 0
17
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - low_freq_excluded: 3991288
18
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - invalid_lines_filtered: 1
19
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - single_char_words_filtered: 10825
20
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:259 INFO: - freq_words_filtered: 7370428
21
+ 2025-03-01 01:26:24,928 ocrqa_create_bloom_filter.py:262 INFO: Estimated word count: 1597894
22
+ 2025-03-01 01:26:25,315 ocrqa_create_bloom_filter.py:266 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-en.bloom
23
+ 2025-03-01 01:26:25,588 ocrqa_create_bloom_filter.py:288 INFO: Diagnosis Results:
24
+ 2025-03-01 01:26:25,588 ocrqa_create_bloom_filter.py:289 INFO: - Excluded words in bloom filter: 0
25
+ 2025-03-01 01:26:25,588 ocrqa_create_bloom_filter.py:290 INFO: - Known words not in bloom filter: 0
26
+ 2025-03-01 01:26:26,327 ocrqa_create_bloom_filter.py:297 INFO: - Low-frequency words in bloom filter: 40
27
+ 2025-03-01 01:26:26,327 ocrqa_create_bloom_filter.py:309 INFO: - Proportion of known words not in bloom filter: 0.00000000
28
+ 2025-03-01 01:26:26,327 ocrqa_create_bloom_filter.py:317 INFO: - Proportion of low-frequency words in bloom filter: 0.00001002
models/ocrqa-wp_v1.0.5-fr.bloom.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:425 INFO: Namespace(input_files=['lex/fr/realwords.rw.txt.bz2', 'lex/fr/morph.rw.txt.bz2', 'lex/fr/single_char.rw.txt.bz2', 'lex/fr/frwiki.unigram.freq.tsv.bz2', 'lex/fr/nonwords.nw.txt.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-fr.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-fr.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True)
2
+ 2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:226 INFO: Starting Bloom Filter creation...
3
+ 2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/fr/nonwords.nw.txt.bz2
4
+ 2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gazelle']
5
+ 2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:190 INFO: Excluded 1 words that should never be added
6
+ 2025-02-10 09:54:39,729 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/realwords.rw.txt.bz2
7
+ 2025-02-10 09:54:39,730 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/morph.rw.txt.bz2
8
+ 2025-02-10 09:54:40,372 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/single_char.rw.txt.bz2
9
+ 2025-02-10 09:54:40,373 ocrqa_create_bloom_filter.py:135 INFO: Processing frequency file: lex/fr/frwiki.unigram.freq.tsv.bz2
10
+ 2025-02-10 09:54:45,072 ocrqa_create_bloom_filter.py:240 INFO: low_freq_excluded before removing parts from high-frequency words: 1552240
11
+ 2025-02-10 09:54:45,172 ocrqa_create_bloom_filter.py:248 INFO: low_freq_excluded after removing parts from high-frequency words: 1228188
12
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:252 INFO: Lexical processing complete.
13
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_read: 1
14
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_count: 1
15
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - realwords_read: 723250
16
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - realwords_accepted: 736917
17
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - realwords_nonwords_filtered: 1
18
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_read: 4047539
19
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - words_accepted: 1992185
20
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - subwords_accepted: 2383447
21
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - subwords_filtered: 9
22
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - low_freq_excluded: 1228188
23
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - single_char_words_filtered: 6597
24
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_filtered: 2048757
25
+ 2025-02-10 09:54:45,173 ocrqa_create_bloom_filter.py:259 INFO: Estimated word count: 1939265
26
+ 2025-02-10 09:54:45,657 ocrqa_create_bloom_filter.py:263 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-fr.bloom
27
+ 2025-02-10 09:54:45,996 ocrqa_create_bloom_filter.py:285 INFO: Diagnosis Results:
28
+ 2025-02-10 09:54:45,996 ocrqa_create_bloom_filter.py:286 INFO: - Excluded words in bloom filter: 0
29
+ 2025-02-10 09:54:45,996 ocrqa_create_bloom_filter.py:287 INFO: - Known words not in bloom filter: 0
30
+ 2025-02-10 09:54:46,146 ocrqa_create_bloom_filter.py:294 INFO: - Low-frequency words in bloom filter: 8
31
+ 2025-02-10 09:54:46,146 ocrqa_create_bloom_filter.py:300 INFO: - Proportion of excluded words in bloom filter: 0.00000000
32
+ 2025-02-10 09:54:46,146 ocrqa_create_bloom_filter.py:306 INFO: - Proportion of known words not in bloom filter: 0.00000000
33
+ 2025-02-10 09:54:46,146 ocrqa_create_bloom_filter.py:314 INFO: - Proportion of low-frequency words in bloom filter: 0.00000651
models/ocrqa-wp_v1.0.5-lb.bloom.log ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:121 INFO: Git version: 50f305d
2
+ 2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:434 INFO: Namespace(input_files=['lex/lb/lbwiki.unigram.freq.tsv.bz2', 'lex/lb/wortschatz.freq.tsv.bz2', 'lex/lb/modern_spelling.rw.txt', 'lex/lb/old_spelling.rw.txt', 'lex/lb/ocr_errors.nw.txt', 'lex/lb/realword_ocr_errors.nw.txt'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-lb.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-lb.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True, git_version='50f305d')
3
+ 2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:229 INFO: Starting Bloom Filter creation...
4
+ 2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:181 INFO: Processing nonword file: lex/lb/ocr_errors.nw.txt
5
+ 2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['ail']
6
+ 2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['dalt']
7
+ 2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['erl']
8
+ 2025-02-28 23:52:03,275 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['liât']
9
+ 2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['liuet']
10
+ 2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['secli']
11
+ 2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:190 INFO: Nonword tokens: ['zc']
12
+ 2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:193 INFO: Excluded 7 words that should never be added
13
+ 2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:181 INFO: Processing nonword file: lex/lb/realword_ocr_errors.nw.txt
14
+ 2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:193 INFO: Excluded 7 words that should never be added
15
+ 2025-02-28 23:52:03,276 ocrqa_create_bloom_filter.py:138 INFO: Processing frequency file: lex/lb/lbwiki.unigram.freq.tsv.bz2
16
+ 2025-02-28 23:52:03,289 ocrqa_create_bloom_filter.py:163 WARNING: Skipping invalid line in frequency file lex/lb/lbwiki.unigram.freq.tsv.bz2: 446
17
+ 2025-02-28 23:52:03,591 ocrqa_create_bloom_filter.py:138 INFO: Processing frequency file: lex/lb/wortschatz.freq.tsv.bz2
18
+ 2025-02-28 23:52:03,814 ocrqa_create_bloom_filter.py:163 WARNING: Skipping invalid line in frequency file lex/lb/wortschatz.freq.tsv.bz2:
19
+ 2025-02-28 23:52:03,814 ocrqa_create_bloom_filter.py:199 INFO: Processing real-word file: lex/lb/modern_spelling.rw.txt
20
+ 2025-02-28 23:52:03,925 ocrqa_create_bloom_filter.py:199 INFO: Processing real-word file: lex/lb/old_spelling.rw.txt
21
+ 2025-02-28 23:52:03,925 ocrqa_create_bloom_filter.py:243 INFO: low_freq_excluded before removing parts from high-frequency words: 156199
22
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:251 INFO: low_freq_excluded after removing parts from high-frequency words: 113360
23
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:255 INFO: Lexical processing complete.
24
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - nonwords_read: 7
25
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - nonwords_count: 7
26
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - freq_words_read: 487375
27
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - words_accepted: 314693
28
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - subwords_accepted: 366393
29
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - subwords_filtered: 0
30
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - low_freq_excluded: 113360
31
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - invalid_lines_filtered: 2
32
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - single_char_words_filtered: 334
33
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - freq_words_filtered: 172346
34
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - realwords_read: 152110
35
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - realwords_accepted: 154531
36
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:259 INFO: - realwords_nonwords_filtered: 0
37
+ 2025-02-28 23:52:03,942 ocrqa_create_bloom_filter.py:262 INFO: Estimated word count: 285555
38
+ 2025-02-28 23:52:04,014 ocrqa_create_bloom_filter.py:266 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.5-lb.bloom
39
+ 2025-02-28 23:52:04,066 ocrqa_create_bloom_filter.py:288 INFO: Diagnosis Results:
40
+ 2025-02-28 23:52:04,066 ocrqa_create_bloom_filter.py:289 INFO: - Excluded words in bloom filter: 0
41
+ 2025-02-28 23:52:04,066 ocrqa_create_bloom_filter.py:290 INFO: - Known words not in bloom filter: 0
42
+ 2025-02-28 23:52:04,079 ocrqa_create_bloom_filter.py:297 INFO: - Low-frequency words in bloom filter: 1
43
+ 2025-02-28 23:52:04,079 ocrqa_create_bloom_filter.py:303 INFO: - Proportion of excluded words in bloom filter: 0.00000000
44
+ 2025-02-28 23:52:04,079 ocrqa_create_bloom_filter.py:309 INFO: - Proportion of known words not in bloom filter: 0.00000000
45
+ 2025-02-28 23:52:04,079 ocrqa_create_bloom_filter.py:317 INFO: - Proportion of low-frequency words in bloom filter: 0.00000882
models/ocrqa-wp_v1.0.6-de.bloom.log ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:425 INFO: Namespace(input_files=['lex/de/realword_ocr_errors.nw.txt', 'lex/de/ocr_errors.nw.txt', 'lex/de/old_spelling.rw.txt', 'lex/de/modern_spelling.rw.txt', 'lex/de/dewiki.unigram.freq.tsv.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-de.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-de.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True)
2
+ 2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:226 INFO: Starting Bloom Filter creation...
3
+ 2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/de/realword_ocr_errors.nw.txt
4
+ 2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['negierung']
5
+ 2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['negierungen']
6
+ 2025-02-14 16:07:47,840 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nidwaiden']
7
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ölten']
8
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unterwaiden']
9
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verlausen']
10
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['vertretet']
11
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:190 INFO: Excluded 7 words that should never be added
12
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/de/ocr_errors.nw.txt
13
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['0oo']
14
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@d']
15
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@e']
16
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@i']
17
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@r']
18
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['@t']
19
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['aargan']
20
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['abbin']
21
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['abgefetzt']
22
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['abgereift']
23
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ahresbesoldung']
24
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['aisbann']
25
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['aneh']
26
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['anmeldungstermln']
27
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ariesheim']
28
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['aueh']
29
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ausgefetzt']
30
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['auslände']
31
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bahmen']
32
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bandesblatt']
33
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bebacht']
34
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bechnung']
35
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['befetzt']
36
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['befetzten']
37
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['begelung']
38
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['begierung']
39
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['begierungen']
40
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['begierungsrat']
41
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['behorde']
42
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['behorden']
43
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['berieht']
44
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bersonen']
45
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['besicht']
46
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bestimm']
47
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['betragt']
48
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['betreifend']
49
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['beutscher']
50
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bevision']
51
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bielleicht']
52
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bingier']
53
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bnndesblatt']
54
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bücksicht']
55
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bundesbehorden']
56
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['diefe']
57
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['diefer']
58
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dingungen']
59
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dnrch']
60
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dnreh']
61
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dureh']
62
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eahmen']
63
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eappen']
64
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eäte']
65
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eatifikation']
66
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ebenfall']
67
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ebruar']
68
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eechnung']
69
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eecht']
70
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eechte']
71
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eechts']
72
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegel']
73
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegelung']
74
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegierung']
75
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegierungen']
76
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eegierungsrat']
77
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eeglement']
78
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eeihe']
79
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eente']
80
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eenten']
81
+ 2025-02-14 16:07:47,841 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eepublik']
82
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eesolution']
83
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eevision']
84
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ehur']
85
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eichter']
86
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eidgenossenschast']
87
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eidgenossische']
88
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eingefetzt']
89
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['einlabung']
90
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eirea']
91
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eiue']
92
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eldg']
93
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['elfaß']
94
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['endlieh']
95
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['engtischen']
96
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eobert']
97
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eolle']
98
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erbalten']
99
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erbbeben']
100
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erfetzt']
101
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erhallen']
102
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erlauft']
103
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erleiben']
104
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erleibet']
105
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['erseht']
106
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['eücksicht']
107
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['euenburg']
108
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['feiet']
109
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['feinet']
110
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['feite']
111
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['festgefetzt']
112
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fetze']
113
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fetzte']
114
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fetzten']
115
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fetzung']
116
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['feuersbrunft']
117
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fiir']
118
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fipoi']
119
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fllr']
120
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fönst']
121
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fortfetzen']
122
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fortfetzung']
123
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['fortgefetzt']
124
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['franzofen']
125
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['franzosischen']
126
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['frauken']
127
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['galleu']
128
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gefetzt']
129
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gemass']
130
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gesellschast']
131
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gesellsehast']
132
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gewöhn']
133
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gierung']
134
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gischen']
135
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['grossere']
136
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['grossern']
137
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['hallung']
138
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['handelsund']
139
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['hauptfache']
140
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['heuligen']
141
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ì000']
142
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['iaht']
143
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['iahte']
144
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['iahten']
145
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['iiber']
146
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['infofern']
147
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['jnni']
148
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['kauton']
149
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['kautone']
150
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['kautons']
151
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['korden']
152
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['kreispostdirektiou']
153
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['leife']
154
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['liier']
155
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['lnzern']
156
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['locamo']
157
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['lostet']
158
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['luzernburg']
159
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['macbonalb']
160
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mahnahmen']
161
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mahregeln']
162
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mährend']
163
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['matznahmen']
164
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['melben']
165
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['melche']
166
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ministet']
167
+ 2025-02-14 16:07:47,842 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mitleib']
168
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['moglich']
169
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['möglid']
170
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['moglieh']
171
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['naeh']
172
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nieht']
173
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nikiaus']
174
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['noeh']
175
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nothig']
176
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['nothigen']
177
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ollem']
178
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['poft']
179
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['prankreich']
180
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['rebakteur']
181
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['reiburg']
182
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['reuenburg']
183
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['roieber']
184
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['rovember']
185
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ruffischen']
186
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['schisse']
187
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['schwierigleiten']
188
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['sehengen']
189
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['seihst']
190
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['siud']
191
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['srühern']
192
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['stanben']
193
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['stobt']
194
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['tatfache']
195
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['tatfachen']
196
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['teten']
197
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['thronrebe']
198
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['uater']
199
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['uicht']
200
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unbein']
201
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unfete']
202
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unier']
203
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['unterschieb']
204
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['urfache']
205
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ürich']
206
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verfetzt']
207
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verkau']
208
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verkauten']
209
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verlauft']
210
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verlaus']
211
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['verlehr']
212
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['vorfitz']
213
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['vstrr']
214
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['webet']
215
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['welehe']
216
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['wnrde']
217
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['znm']
218
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['zurlch']
219
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:190 INFO: Excluded 213 words that should never be added
220
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/de/old_spelling.rw.txt
221
+ 2025-02-14 16:07:47,843 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/de/modern_spelling.rw.txt
222
+ 2025-02-14 16:07:47,844 ocrqa_create_bloom_filter.py:135 INFO: Processing frequency file: lex/de/dewiki.unigram.freq.tsv.bz2
223
+ 2025-02-14 16:08:00,551 ocrqa_create_bloom_filter.py:240 INFO: low_freq_excluded before removing parts from high-frequency words: 3780824
224
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:248 INFO: low_freq_excluded after removing parts from high-frequency words: 3288723
225
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:252 INFO: Lexical processing complete.
226
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_read: 213
227
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_count: 213
228
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - realwords_read: 488
229
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - realwords_accepted: 488
230
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - realwords_nonwords_filtered: 0
231
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_read: 9199714
232
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - words_accepted: 4143119
233
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - subwords_accepted: 5029202
234
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - subwords_filtered: 1
235
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - low_freq_excluded: 3288723
236
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - single_char_words_filtered: 6719
237
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_filtered: 5049876
238
+ 2025-02-14 16:08:00,724 ocrqa_create_bloom_filter.py:259 INFO: Estimated word count: 3358453
239
+ 2025-02-14 16:08:01,663 ocrqa_create_bloom_filter.py:263 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-de.bloom
240
+ 2025-02-14 16:08:02,322 ocrqa_create_bloom_filter.py:285 INFO: Diagnosis Results:
241
+ 2025-02-14 16:08:02,323 ocrqa_create_bloom_filter.py:286 INFO: - Excluded words in bloom filter: 0
242
+ 2025-02-14 16:08:02,323 ocrqa_create_bloom_filter.py:287 INFO: - Known words not in bloom filter: 0
243
+ 2025-02-14 16:08:02,821 ocrqa_create_bloom_filter.py:294 INFO: - Low-frequency words in bloom filter: 25
244
+ 2025-02-14 16:08:02,821 ocrqa_create_bloom_filter.py:300 INFO: - Proportion of excluded words in bloom filter: 0.00000000
245
+ 2025-02-14 16:08:02,821 ocrqa_create_bloom_filter.py:306 INFO: - Proportion of known words not in bloom filter: 0.00000000
246
+ 2025-02-14 16:08:02,821 ocrqa_create_bloom_filter.py:314 INFO: - Proportion of low-frequency words in bloom filter: 0.00000760
models/ocrqa-wp_v1.0.6-fr.bloom.log ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-02-14 16:08:03,171 ocrqa_create_bloom_filter.py:425 INFO: Namespace(input_files=['lex/fr/realword_ocr_errors.nw.txt', 'lex/fr/ocr_errors.nw.txt', 'lex/fr/old_spelling.rw.txt', 'lex/fr/modern_spelling.rw.txt', 'lex/fr/morph.rw.txt.bz2', 'lex/fr/single_char.rw.txt.bz2', 'lex/fr/frwiki.unigram.freq.tsv.bz2'], bloom_path='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-fr.bloom', fp_probability=1e-05, log_level='INFO', log_file='build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-fr.bloom.log', config=None, min_frequency=2, single_char_min_frequency=20, diagnose_bloom=True)
2
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:226 INFO: Starting Bloom Filter creation...
3
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/fr/realword_ocr_errors.nw.txt
4
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['gazelle']
5
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:190 INFO: Excluded 1 words that should never be added
6
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:178 INFO: Processing nonword file: lex/fr/ocr_errors.nw.txt
7
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['bàie']
8
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['chepin']
9
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['dftce']
10
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ì']
11
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['locamo']
12
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mmmi']
13
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mmml']
14
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['mmmv']
15
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['neuchat']
16
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['neuchàtel']
17
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ölten']
18
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['ponr']
19
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['pribourg']
20
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['remercîments']
21
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['sadresser']
22
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:187 INFO: Nonword tokens: ['zuberbuhler']
23
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:190 INFO: Excluded 17 words that should never be added
24
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/old_spelling.rw.txt
25
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/modern_spelling.rw.txt
26
+ 2025-02-14 16:08:03,172 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/morph.rw.txt.bz2
27
+ 2025-02-14 16:08:03,856 ocrqa_create_bloom_filter.py:196 INFO: Processing real-word file: lex/fr/single_char.rw.txt.bz2
28
+ 2025-02-14 16:08:03,857 ocrqa_create_bloom_filter.py:135 INFO: Processing frequency file: lex/fr/frwiki.unigram.freq.tsv.bz2
29
+ 2025-02-14 16:08:08,932 ocrqa_create_bloom_filter.py:240 INFO: low_freq_excluded before removing parts from high-frequency words: 1548627
30
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:248 INFO: low_freq_excluded after removing parts from high-frequency words: 1224019
31
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:252 INFO: Lexical processing complete.
32
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_read: 17
33
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - nonwords_count: 17
34
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - realwords_read: 723486
35
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - realwords_accepted: 737153
36
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - realwords_nonwords_filtered: 1
37
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_read: 4047539
38
+ 2025-02-14 16:08:09,031 ocrqa_create_bloom_filter.py:256 INFO: - words_accepted: 1992185
39
+ 2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - subwords_accepted: 2385659
40
+ 2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - subwords_filtered: 9
41
+ 2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - low_freq_excluded: 1224019
42
+ 2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - single_char_words_filtered: 6597
43
+ 2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:256 INFO: - freq_words_filtered: 2048757
44
+ 2025-02-14 16:08:09,032 ocrqa_create_bloom_filter.py:259 INFO: Estimated word count: 1937683
45
+ 2025-02-14 16:08:09,562 ocrqa_create_bloom_filter.py:263 INFO: Bloom Filter created and saved to build.d/fp_prob_0.00001/ocrqa-wp_v1.0.6-fr.bloom
46
+ 2025-02-14 16:08:09,924 ocrqa_create_bloom_filter.py:285 INFO: Diagnosis Results:
47
+ 2025-02-14 16:08:09,924 ocrqa_create_bloom_filter.py:286 INFO: - Excluded words in bloom filter: 0
48
+ 2025-02-14 16:08:09,924 ocrqa_create_bloom_filter.py:287 INFO: - Known words not in bloom filter: 0
49
+ 2025-02-14 16:08:10,101 ocrqa_create_bloom_filter.py:294 INFO: - Low-frequency words in bloom filter: 15
50
+ 2025-02-14 16:08:10,101 ocrqa_create_bloom_filter.py:300 INFO: - Proportion of excluded words in bloom filter: 0.00000000
51
+ 2025-02-14 16:08:10,101 ocrqa_create_bloom_filter.py:306 INFO: - Proportion of known words not in bloom filter: 0.00000000
52
+ 2025-02-14 16:08:10,101 ocrqa_create_bloom_filter.py:314 INFO: - Proportion of low-frequency words in bloom filter: 0.00001225
lang_detect.py → ocr_qa_assessment.py RENAMED
@@ -1,7 +1,7 @@
1
  from transformers import Pipeline
2
 
3
 
4
- class LangDetectionPipeline(Pipeline):
5
 
6
  def _sanitize_parameters(self, **kwargs):
7
  preprocess_kwargs = {}
 
1
  from transformers import Pipeline
2
 
3
 
4
+ class QAAssessmentPipeline(Pipeline):
5
 
6
  def _sanitize_parameters(self, **kwargs):
7
  preprocess_kwargs = {}
test.py DELETED
@@ -1,13 +0,0 @@
1
- from transformers import AutoModelForTokenClassification, AutoTokenizer
2
- from transformers import pipeline
3
-
4
- MODEL_NAME = "emanuelaboros/lang-detect"
5
-
6
- lang_pipeline = pipeline(
7
- "lang-detect", model=MODEL_NAME, trust_remote_code=True, device="cpu"
8
- )
9
-
10
- sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
11
-
12
- langs = lang_pipeline(sentence)
13
- langs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_floret.py DELETED
@@ -1,10 +0,0 @@
1
- import floret
2
-
3
- model_floret = floret.load_model("LID-40-3-2000000-1-4.bin")
4
- print(type(model_floret))
5
- input_ids = 'this is a text'
6
-
7
- input_ids = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
8
-
9
- print(model_floret.predict([input_ids], k=1))
10
-