Duplicate Detection Example [REST API]
Find near-duplicates in a text collection
Out:
0. Load the test dataset
GET http://localhost:5001/api/v0/datasets/treclegal09_2k_subset
1.a Load dataset and initialize feature extraction
POST http://localhost:5001/api/v0/feature-extraction
=> received [u'id', u'filenames']
=> dsid = 1f71880c97614854b8bd6d0e81f123b0
1.b Run feature extraction
POST http://localhost:5001/api/v0/feature-extraction/1f71880c97614854b8bd6d0e81f123b0
1.c Check the parameters of the extracted features
GET http://localhost:5001/api/v0/feature-extraction/1f71880c97614854b8bd6d0e81f123b0
- binary: False
- n_jobs: -1
- stop_words: english
- use_hashing: False
- min_df: 4.0
- n_samples: 2465
- analyzer: word
- ngram_range: [1, 1]
- max_df: 0.75
- chunk_size: 2000
- use_idf: True
- data_dir: ../freediscovery_shared/treclegal09_2k_subset/data
- sublinear_tf: False
- n_samples_processed: 2465
- n_features: 30001
- norm: l2
2. Near-duplicate detection by cosine similarity (DBSCAN)
POST http://localhost:5001/api/v0/clustering/dbscan/
=> model id = 31cd41f986224c669eeeb97b0470b4b5
GET http://localhost:5001/api/v0/clustering/dbscan/31cd41f986224c669eeeb97b0470b4b5
.. computed in 44.0s
Found 91 duplicates / 2465
3.a Near-duplicate detection using I-Match
POST http://localhost:5001/api/v0/duplicate-detection/
=> model id = 89f3938ed2854af2a0fffd75b4dc93cf
.. computed in 0.0s
GET http://localhost:5001/api/v0/duplicate-detection/89f3938ed2854af2a0fffd75b4dc93cf
.. computed in 2.6s
Found 238 duplicates / 2465
3.b Near-duplicate detection by Simhash
POST http://localhost:5001/api/v0/duplicate-detection/
=> model id = e83bc05f61434311aeae67d5a9a2d937
.. computed in 0.8s
GET http://localhost:5001/api/v0/duplicate-detection/e83bc05f61434311aeae67d5a9a2d937
.. computed in 0.1s
Found 303 duplicates / 2465
4.a Delete the extracted features
DELETE http://localhost:5001/api/v0/feature-extraction/1f71880c97614854b8bd6d0e81f123b0
from __future__ import print_function
from time import time
import sys
import platform
import numpy as np
import pandas as pd
import requests
pd.options.display.float_format = '{:,.3f}'.format
dataset_name = "treclegal09_2k_subset" # see list of available datasets
BASE_URL = "http://localhost:5001/api/v0" # FreeDiscovery server URL
print(" 0. Load the test dataset")
url = BASE_URL + '/datasets/{}'.format(dataset_name)
print(" GET", url)
res = requests.get(url)
res = res.json()
# To use a custom dataset, simply specify the following variables
data_dir = res['data_dir']
# # 1. Feature extraction (non hashed)
print("\n1.a Load dataset and initalize feature extraction")
url = BASE_URL + '/feature-extraction'
print(" POST", url)
fe_opts = {'data_dir': data_dir,
'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
'use_idf': 1, 'sublinear_tf': 0, 'binary': 0, 'n_features': 30001,
'analyzer': 'word', 'ngram_range': (1, 1), "norm": "l2",
'use_hashing': False, # hashing should be disabled for clustering
'min_df': 4, 'max_df': 0.75
}
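# Note (an added aside, not part of the original example): with
# use_hashing=False an explicit vocabulary is kept, which the clustering and
# duplicate-detection steps below rely on; min_df/max_df prune terms that
# occur in fewer than 4 documents or in more than 75% of them.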
res = requests.post(url, json=fe_opts)
dsid = res.json()['id']
print(" => received {}".format(list(res.json().keys())))
print(" => dsid = {}".format(dsid))
print("\n1.b Run feature extraction")
# progress status is available for the hashed version only
url = BASE_URL+'/feature-extraction/{}'.format(dsid)
print(" POST", url)
res = requests.post(url)
print("\n1.d. check the parameters of the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(' GET', url)
res = requests.get(url)
data = res.json()
print('\n'.join([' - {}: {}'.format(key, val) for key, val in data.items()
                 if "filenames" not in key]))
print("\n2. Near Duplicates detection by cosine similarity (DBSCAN)")
url = BASE_URL + '/clustering/dbscan/'
print(" POST", url)
t0 = time()
res = requests.post(url,
json={'dataset_id': dsid,
'lsi_components': 100,
'eps': 0.1,  # max euclidean distance for documents to count as duplicates
'n_max_samples': 2
}).json()
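# Assuming the LSI vectors are L2-normalized, squared euclidean distance and
# cosine similarity are related by d**2 = 2*(1 - cos), so eps=0.1 accepts
# pairs with cosine similarity >= 1 - 0.1**2/2 = 0.995.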
mid = res['id']
print(" => model id = {}".format(mid))
url = BASE_URL + '/clustering/dbscan/{}'.format(mid)
print(" GET", url)
res = requests.get(url,
json={'n_top_words': 0, # don't compute cluster labels
}).json()
t1 = time()
print(' .. computed in {:.1f}s'.format(t1 - t0))
labels_ = res['labels']
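# Each cluster of size k contributes k-1 redundant documents, so the count
# below is n_samples minus the number of distinct cluster labels.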
print('Found {} duplicates / {}'.format(len(labels_) - len(np.unique(labels_)), len(labels_)))
print("\n3. Near Duplicates Detection using I-Match")
url = BASE_URL + '/duplicate-detection/'
print(" POST", url)
t0 = time()
res = requests.post(url,
json={'dataset_id': dsid,
'method': 'i-match',
})
data = res.json()
mid = data['id']
print(" => model id = {}".format(mid))
print(' .. computed in {:.1f}s'.format(time() - t0))
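# I-Match builds a single hash from a pruned lexicon of each document, so
# documents with identical hashes are flagged as duplicates. The query below
# uses the randomized I-Match variant: n_rand_lexicons draws 10 random
# sub-lexicons, each keeping 90% of the terms (rand_lexicon_ratio), which
# makes the signature more robust to small edits.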
url = BASE_URL + '/duplicate-detection/{}'.format(mid)
print(" GET", url)
t0 = time()
res = requests.get(url,
json={'n_rand_lexicons': 10,
'rand_lexicon_ratio': 0.9}).json()
t1 = time()
print(' .. computed in {:.1f}s'.format(t1 - t0))
labels_ = res['cluster_id']
print('Found {} duplicates / {}'.format(len(labels_) - len(np.unique(labels_)), len(labels_)))
if platform.system() == 'Windows':
print('Simhash-py is currently not implemented for Windows.')
sys.exit()
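# A minimal sketch of the simhash idea, for illustration only (FreeDiscovery
# delegates to the simhash-py extension; this pure-Python version is an
# assumption about the technique, not the library's code): every token votes
# on each of 64 bits, and near-duplicate token sets yield fingerprints that
# differ in only a few bit positions (small Hamming distance).
import hashlib

def simhash64(tokens):
    v = [0] * 64
    for tok in tokens:
        h = int(hashlib.md5(tok.encode('utf-8')).hexdigest(), 16) & ((1 << 64) - 1)
        for i in range(64):
            v[i] += 1 if (h >> i) & 1 else -1
    return sum(1 << i for i in range(64) if v[i] > 0)

def hamming(a, b):
    return bin(a ^ b).count('1')

# hamming(simhash64(doc_a.split()), simhash64(doc_b.split())) tends to be
# small when doc_a and doc_b share most of their tokens.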
print("\n3. Duplicate detection by Simhash")
url = BASE_URL + '/duplicate-detection/'
print(" POST", url)
t0 = time()
res = requests.post(url,
json={'dataset_id': dsid,
'method': 'simhash',
})
data = res.json()
mid = data['id']
print(" => model id = {}".format(mid))
print(' .. computed in {:.1f}s'.format(time() - t0))
url = BASE_URL + '/duplicate-detection/{}'.format(mid)
print(" GET", url)
t0 = time()
res = requests.get(url,
json={'distance': 1})
data = res.json()
print(' .. computed in {:.1f}s'.format(time() - t0))
labels_ = data['cluster_id']
print('Found {} duplicates / {}'.format(len(labels_) - len(np.unique(labels_)), len(labels_)))
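# A minimal sketch (an addition, not part of the original example): pair each
# cluster_id with the document filenames stored by the feature-extraction
# endpoint to inspect the actual duplicate groups. This assumes both lists
# follow the same document order.
filenames = requests.get(BASE_URL + '/feature-extraction/{}'.format(dsid)).json()['filenames']
groups = pd.DataFrame({'file': filenames, 'cluster': labels_})
dup_groups = groups.groupby('cluster').filter(lambda g: len(g) > 1)
print(dup_groups.sort_values('cluster').head())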
# 4. Cleaning
print("\n4.a Delete the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(" DELETE", url)
requests.delete(url)
Total running time of the script: ( 0 minutes 49.804 seconds)