Spaces:

prb977
/

cooccurrence_count

Runtime error

App Files Files Community

Prabin Bhandari committed on Aug 24, 2022

Commit

d303927

1 Parent(s): 4d57961

Update the module

Browse files

Files changed (4) hide show

app.py +1 -1
cooccurrence_count.py +54 -50
requirements.txt +3 -1
tests.py +5 -14

app.py CHANGED Viewed

@@ -3,4 +3,4 @@ from evaluate.utils import launch_gradio_widget
 module = evaluate.load("prb977/cooccurrence_count")
-launch_gradio_widget(module)


3
4
5	module = evaluate.load("prb977/cooccurrence_count")
6	+ launch_gradio_widget(module)

cooccurrence_count.py CHANGED Viewed

@@ -1,4 +1,5 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,85 +12,88 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""TODO: Add a description here."""
 import evaluate
 import datasets
-# TODO: Add BibTeX citation
-_CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
-}
-"""
-# TODO: Add description of the module here
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
 """
-# TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
     >>> print(results)
-    {'accuracy': 1.0}
 """
-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
-@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class CooccurrenceCount(evaluate.Measurement):
-    """TODO: Short description of my evaluation module."""
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MeasurementInfo(
-            # This is the description that will appear on the modules page.
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
             }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
         )
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-    def _compute(self, predictions, references):
         """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
         return {
-            "accuracy": accuracy,
-        }

+# Copyright 2020 The HuggingFace Datasets Authors and the current
+# dataset script contributor.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Get the co-occurrence count for two words in each sentence in a dataset.
+"""
 import evaluate
 import datasets
+from sklearn.feature_extraction.text import CountVectorizer
+import numpy as np
 _DESCRIPTION = """\
+Returns the co-occurrence count of two words in the input.
 """
+_CITATION = ""
 _KWARGS_DESCRIPTION = """
+Calculates the co-occurence of two words in each sentence.
 Args:
+    `data`: a list of `str` which containes a dataset.
+    `word1`: The first word.
+    `word2`: The second word.
 Returns:
+    count: The co-occurrence count of word1 and word2 in data.
 Examples:
+    >>> data = ["hello sun","hello moon", "hello sun"]
+    >>> c_count = evaluate.load("prb977/cooccurrence_count")
+    >>> results = c_count.compute(references=data, word1='hello', word2='sun')
     >>> print(results)
+    {'count': 3, 'co_occurrence_count': 2}
 """
+def check_count(x):
+    if x[0].all() <= 0:
+        return 0
+    return 1
+@evaluate.utils.file_utils.add_start_docstrings(
+    _DESCRIPTION,
+    _KWARGS_DESCRIPTION
+)
 class CooccurrenceCount(evaluate.Measurement):
+    """This measurement returns the co-occurrence count of two words."""
     def _info(self):
         return evaluate.MeasurementInfo(
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
+                'data': datasets.Value('string'),
+                'word1': datasets.Value('string'),
+                'word2': datasets.Value('string'),
             }),
         )
+    def _compute(self, data, word1, word2):
         """Returns the scores"""
+        len1 = len(word1.split(' '))
+        len2 = len(word2.split(' '))
+        gram = len1 if len1 > len2 else len2
+        v = CountVectorizer(ngram_range=(gram, gram))
+        analyzer = v.build_analyzer()
+        vectorizer = CountVectorizer(
+            ngram_range=(gram, gram),
+            vocabulary={
+                analyzer(word1)[0]: 0,
+                analyzer(word2)[0]: 1
+            }
+        )
+        co_occurrences = vectorizer.fit_transform(data)
+        dense_mat = co_occurrences.todense()
+        count = len(dense_mat)
+        co_occurrence_count = np.sum(
+            np.apply_along_axis(check_count, axis=1, arr=dense_mat)
+        )
         return {
+            "cout": count,
+            "co_occurrence_count": co_occurrence_count,
+        }

requirements.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	- git+https://github.com/huggingface/evaluate@main

+git+https://github.com/huggingface/evaluate@main
+sklearn
+numpy

tests.py CHANGED Viewed

@@ -1,17 +1,8 @@
 test_cases = [
     {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
     },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]

 test_cases = [
     {
+        "data": ["hello sun", "hello moon", "hello sun"],
+        "word1": "hello",
+        "word2": "sun",
+        "result": {"count": 2}
     },
+]