/ Information Retrieval

Determining Similarity: Jaccard Index

The Jaccard index(Jaccard similarity coefficient) is a statistic used for comparing the similarity and diversity of sample sets. It measures similarity between finite sample sets, and is defined as the size of the intersection divided by the size of the union of the sample sets.

The problem of Jaccard Index

It doesn’t consider term frequency (how many occurrences a term has).
Rare terms are more informative than frequent terms.
Jaccard does not consider this information.
We need a more sophisticated way of normalizing for the length of a document.

Let's compute with Python!

# Jaccard Coefficient is equal to (A Intersect B) / (A Union B)

from typing import Set


def jaccard_index(first_set: Set, second_set: Set) -> Set:
    # If both sets are empty, jaccard index is defined to be 1
    index = 1.0
    if first_set or second_set:
        index = float(len(first_set.intersection(second_set))) / len(first_set.union(second_set))
        index = float("{0:.3f}".format(index))
        return index


def main(first_set, second_set):
    res = jaccard_index(first_set, second_set)
    print("\n first set: {first_set}\n"
          "second set: {second_set}\n"
          "jaccard_index: {res}"
          .format(first_set=first_set, second_set=second_set, res=res))
    return res


if __name__ == '__main__':
    main()

Test.py

import unittest
import jaccard_index as JI


class Test_Jaccard_Index(unittest.TestCase):
    def test_case_1(self):
        first_set = set(range(10))
        second_set = set(range(5, 20))
        res = JI.main(first_set, second_set)
        self.assertEqual(res, 0.25)

    def test_case_2(self):
        first_set = set("ides of March".lower().split())
        second_set = set("“Caesar died in March".lower().split())
        res = JI.main(first_set, second_set)
        self.assertEqual(res, 0.167)

    def test_case_3(self):
        first_str = "My home town is seoul, south Korea.".lower()
        second_str = "Where is your home town?".lower()
        k_gram = 2
        first_set = ngrams_split(list(first_str), k_gram)
        second_set = ngrams_split(list(second_str), k_gram)
        res = JI.main(first_set, second_set)
        self.assertEqual(res, 0.368)

    def test_case_4(self):
        first_set = set("ides of March".lower().split())
        second_set = set("ides of March".lower().split())
        res = JI.main(first_set, second_set)
        self.assertEqual(res, 1)


def ngrams_split(lst, n):
    return set([''.join(lst[i:i + n]) for i in range(len(lst) - n)])


if __name__ == '__main__':
    unittest.main()