Spaces:
Runtime error
Runtime error
Prabin Bhandari
committed on
Commit
·
d303927
1
Parent(s):
4d57961
Update the module
Browse files- app.py +1 -1
- cooccurrence_count.py +54 -50
- requirements.txt +3 -1
- tests.py +5 -14
app.py
CHANGED
@@ -3,4 +3,4 @@ from evaluate.utils import launch_gradio_widget
|
|
3 |
|
4 |
|
5 |
module = evaluate.load("prb977/cooccurrence_count")
|
6 |
-
launch_gradio_widget(module)
|
|
|
3 |
|
4 |
|
5 |
module = evaluate.load("prb977/cooccurrence_count")
|
6 |
+
launch_gradio_widget(module)
|
cooccurrence_count.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
-
# Copyright 2020 The HuggingFace Datasets Authors and the current
|
|
|
2 |
#
|
3 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
# you may not use this file except in compliance with the License.
|
@@ -11,85 +12,88 @@
|
|
11 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
-
"""
|
|
|
|
|
|
|
15 |
|
16 |
import evaluate
|
17 |
import datasets
|
|
|
|
|
18 |
|
19 |
|
20 |
-
# TODO: Add BibTeX citation
|
21 |
-
_CITATION = """\
|
22 |
-
@InProceedings{huggingface:module,
|
23 |
-
title = {A great new module},
|
24 |
-
authors={huggingface, Inc.},
|
25 |
-
year={2020}
|
26 |
-
}
|
27 |
-
"""
|
28 |
-
|
29 |
-
# TODO: Add description of the module here
|
30 |
_DESCRIPTION = """\
|
31 |
-
|
32 |
"""
|
33 |
|
|
|
34 |
|
35 |
-
# TODO: Add description of the arguments of the module here
|
36 |
_KWARGS_DESCRIPTION = """
|
37 |
-
Calculates
|
38 |
Args:
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
reference should be a string with tokens separated by spaces.
|
43 |
Returns:
|
44 |
-
|
45 |
-
another_score: description of the second score,
|
46 |
Examples:
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
>>> my_new_module = evaluate.load("my_new_module")
|
51 |
-
>>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
|
52 |
>>> print(results)
|
53 |
-
{'
|
54 |
"""
|
55 |
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
58 |
|
59 |
|
60 |
-
@evaluate.utils.file_utils.add_start_docstrings(
|
|
|
|
|
|
|
61 |
class CooccurrenceCount(evaluate.Measurement):
|
62 |
-
"""
|
63 |
|
64 |
def _info(self):
|
65 |
-
# TODO: Specifies the evaluate.EvaluationModuleInfo object
|
66 |
return evaluate.MeasurementInfo(
|
67 |
-
# This is the description that will appear on the modules page.
|
68 |
module_type="measurement",
|
69 |
description=_DESCRIPTION,
|
70 |
citation=_CITATION,
|
71 |
inputs_description=_KWARGS_DESCRIPTION,
|
72 |
# This defines the format of each prediction and reference
|
73 |
features=datasets.Features({
|
74 |
-
'
|
75 |
-
'
|
|
|
76 |
}),
|
77 |
-
# Homepage of the module for documentation
|
78 |
-
homepage="http://module.homepage",
|
79 |
-
# Additional links to the codebase or references
|
80 |
-
codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
|
81 |
-
reference_urls=["http://path.to.reference.url/new_module"]
|
82 |
)
|
83 |
|
84 |
-
def
|
85 |
-
"""Optional: download external resources useful to compute the scores"""
|
86 |
-
# TODO: Download external resources if needed
|
87 |
-
pass
|
88 |
-
|
89 |
-
def _compute(self, predictions, references):
|
90 |
"""Returns the scores"""
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
return {
|
94 |
-
"
|
95 |
-
|
|
|
|
1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the current
|
2 |
+
# dataset script contributor.
|
3 |
#
|
4 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
# you may not use this file except in compliance with the License.
|
|
|
12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
15 |
+
"""
|
16 |
+
Get the co-occurrence count for two words in each sentence in a dataset.
|
17 |
+
"""
|
18 |
+
|
19 |
|
20 |
import evaluate
|
21 |
import datasets
|
22 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
23 |
+
import numpy as np
|
24 |
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
_DESCRIPTION = """\
|
27 |
+
Returns the co-occurrence count of two words in the input.
|
28 |
"""
|
29 |
|
30 |
+
_CITATION = ""
|
31 |
|
|
|
32 |
_KWARGS_DESCRIPTION = """
|
33 |
+
Calculates the co-occurrence of two words in each sentence.
|
34 |
Args:
|
35 |
+
`data`: a list of `str` which contains a dataset.
|
36 |
+
`word1`: The first word.
|
37 |
+
`word2`: The second word.
|
|
|
38 |
Returns:
|
39 |
+
count: The co-occurrence count of word1 and word2 in data.
|
|
|
40 |
Examples:
|
41 |
+
>>> data = ["hello sun","hello moon", "hello sun"]
|
42 |
+
>>> c_count = evaluate.load("prb977/cooccurrence_count")
|
43 |
+
>>> results = c_count.compute(references=data, word1='hello', word2='sun')
|
|
|
|
|
44 |
>>> print(results)
|
45 |
+
{'count': 3, 'co_occurrence_count': 2}
|
46 |
"""
|
47 |
|
48 |
+
|
49 |
+
def check_count(x):
|
50 |
+
if x[0].all() <= 0:
|
51 |
+
return 0
|
52 |
+
return 1
|
53 |
|
54 |
|
55 |
+
@evaluate.utils.file_utils.add_start_docstrings(
|
56 |
+
_DESCRIPTION,
|
57 |
+
_KWARGS_DESCRIPTION
|
58 |
+
)
|
59 |
class CooccurrenceCount(evaluate.Measurement):
|
60 |
+
"""This measurement returns the co-occurrence count of two words."""
|
61 |
|
62 |
def _info(self):
|
|
|
63 |
return evaluate.MeasurementInfo(
|
|
|
64 |
module_type="measurement",
|
65 |
description=_DESCRIPTION,
|
66 |
citation=_CITATION,
|
67 |
inputs_description=_KWARGS_DESCRIPTION,
|
68 |
# This defines the format of each prediction and reference
|
69 |
features=datasets.Features({
|
70 |
+
'data': datasets.Value('string'),
|
71 |
+
'word1': datasets.Value('string'),
|
72 |
+
'word2': datasets.Value('string'),
|
73 |
}),
|
|
|
|
|
|
|
|
|
|
|
74 |
)
|
75 |
|
76 |
+
def _compute(self, data, word1, word2):
|
|
|
|
|
|
|
|
|
|
|
77 |
"""Returns the scores"""
|
78 |
+
len1 = len(word1.split(' '))
|
79 |
+
len2 = len(word2.split(' '))
|
80 |
+
gram = len1 if len1 > len2 else len2
|
81 |
+
v = CountVectorizer(ngram_range=(gram, gram))
|
82 |
+
analyzer = v.build_analyzer()
|
83 |
+
vectorizer = CountVectorizer(
|
84 |
+
ngram_range=(gram, gram),
|
85 |
+
vocabulary={
|
86 |
+
analyzer(word1)[0]: 0,
|
87 |
+
analyzer(word2)[0]: 1
|
88 |
+
}
|
89 |
+
)
|
90 |
+
co_occurrences = vectorizer.fit_transform(data)
|
91 |
+
dense_mat = co_occurrences.todense()
|
92 |
+
count = len(dense_mat)
|
93 |
+
co_occurrence_count = np.sum(
|
94 |
+
np.apply_along_axis(check_count, axis=1, arr=dense_mat)
|
95 |
+
)
|
96 |
return {
|
97 |
+
"cout": count,
|
98 |
+
"co_occurrence_count": co_occurrence_count,
|
99 |
+
}
|
requirements.txt
CHANGED
@@ -1 +1,3 @@
|
|
1 |
-
git+https://github.com/huggingface/evaluate@main
|
|
|
|
|
|
1 |
+
git+https://github.com/huggingface/evaluate@main
|
2 |
+
sklearn
|
3 |
+
numpy
|
tests.py
CHANGED
@@ -1,17 +1,8 @@
|
|
1 |
test_cases = [
|
2 |
{
|
3 |
-
"
|
4 |
-
"
|
5 |
-
"
|
|
|
6 |
},
|
7 |
-
|
8 |
-
"predictions": [1, 1],
|
9 |
-
"references": [1, 1],
|
10 |
-
"result": {"metric_score": 1}
|
11 |
-
},
|
12 |
-
{
|
13 |
-
"predictions": [1, 0],
|
14 |
-
"references": [1, 1],
|
15 |
-
"result": {"metric_score": 0.5}
|
16 |
-
}
|
17 |
-
]
|
|
|
1 |
test_cases = [
|
2 |
{
|
3 |
+
"data": ["hello sun", "hello moon", "hello sun"],
|
4 |
+
"word1": "hello",
|
5 |
+
"word2": "sun",
|
6 |
+
"result": {"count": 2}
|
7 |
},
|
8 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|