.. _sphx_glr_examples_duplicate_detection_example.py:

Duplicate Detection Example [REST API]
--------------------------------------

Find near-duplicates in a text collection.

.. rst-class:: sphx-glr-script-out

 Out::

     0. Load the test dataset
     GET http://localhost:5001/api/v0/datasets/treclegal09_2k_subset

    1.a Load dataset and initialize feature extraction
     POST http://localhost:5001/api/v0/feature-extraction
     => received [u'id', u'filenames']
     => dsid = 1f71880c97614854b8bd6d0e81f123b0

    1.b Run feature extraction
     POST http://localhost:5001/api/v0/feature-extraction/1f71880c97614854b8bd6d0e81f123b0

    1.d Check the parameters of the extracted features
     GET http://localhost:5001/api/v0/feature-extraction/1f71880c97614854b8bd6d0e81f123b0
     - binary: False
     - n_jobs: -1
     - stop_words: english
     - use_hashing: False
     - min_df: 4.0
     - n_samples: 2465
     - analyzer: word
     - ngram_range: [1, 1]
     - max_df: 0.75
     - chunk_size: 2000
     - use_idf: True
     - data_dir: ../freediscovery_shared/treclegal09_2k_subset/data
     - sublinear_tf: False
     - n_samples_processed: 2465
     - n_features: 30001
     - norm: l2

    2. Near Duplicates detection by cosine similarity (DBSCAN)
     POST http://localhost:5001/api/v0/clustering/dbscan/
     => model id = 31cd41f986224c669eeeb97b0470b4b5
     GET http://localhost:5001/api/v0/clustering/dbscan/31cd41f986224c669eeeb97b0470b4b5
     .. computed in 44.0s
    Found 91 duplicates / 2465

    3.a Near Duplicates Detection using I-Match
     POST http://localhost:5001/api/v0/duplicate-detection/
     => model id = 89f3938ed2854af2a0fffd75b4dc93cf
     .. computed in 0.0s
     GET http://localhost:5001/api/v0/duplicate-detection/89f3938ed2854af2a0fffd75b4dc93cf
     .. computed in 2.6s
    Found 238 duplicates / 2465

    3.b Duplicate detection by Simhash
     POST http://localhost:5001/api/v0/duplicate-detection/
     => model id = e83bc05f61434311aeae67d5a9a2d937
     .. computed in 0.8s
     GET http://localhost:5001/api/v0/duplicate-detection/e83bc05f61434311aeae67d5a9a2d937
     .. computed in 0.1s
    Found 303 duplicates / 2465

    4.a Delete the extracted features
     DELETE http://localhost:5001/api/v0/feature-extraction/1f71880c97614854b8bd6d0e81f123b0
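The script below assumes a FreeDiscovery server is listening on ``localhost:5001``
and does not check HTTP status codes before decoding the JSON responses. When
adapting it, a small wrapper along the following lines makes failed requests easier
to diagnose; this is only a sketch, and the ``get_json`` helper is not part of the
FreeDiscovery API:

.. code-block:: python

    import requests

    BASE_URL = "http://localhost:5001/api/v0"

    def get_json(url, **kwargs):
        """GET ``url`` and return the decoded JSON body, raising on HTTP errors."""
        res = requests.get(url, **kwargs)
        res.raise_for_status()  # surface 4xx/5xx errors instead of a JSON decode error
        return res.json()

    # for instance, fetch the metadata of the test dataset used below
    # dataset_info = get_json(BASE_URL + '/datasets/treclegal09_2k_subset')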

.. code-block:: python

    from __future__ import print_function

    from time import time
    import sys
    import platform

    import numpy as np
    import pandas as pd
    import requests

    pd.options.display.float_format = '{:,.3f}'.format

    dataset_name = "treclegal09_2k_subset"     # see the list of available datasets
    BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL

    print(" 0. Load the test dataset")
    url = BASE_URL + '/datasets/{}'.format(dataset_name)
    print(" GET", url)
    res = requests.get(url)
    res = res.json()

    # To use a custom dataset, simply specify the following variable
    data_dir = res['data_dir']

    # 1. Feature extraction (non hashed)

    print("\n1.a Load dataset and initialize feature extraction")
    url = BASE_URL + '/feature-extraction'
    print(" POST", url)
    fe_opts = {'data_dir': data_dir,
               'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
               'use_idf': 1, 'sublinear_tf': 0, 'binary': 0, 'n_features': 30001,
               'analyzer': 'word', 'ngram_range': (1, 1), "norm": "l2",
               'use_hashing': False,  # hashing should be disabled for clustering
               'min_df': 4, 'max_df': 0.75
               }
    res = requests.post(url, json=fe_opts)

    dsid = res.json()['id']
    print(" => received {}".format(list(res.json().keys())))
    print(" => dsid = {}".format(dsid))

    print("\n1.b Run feature extraction")
    # progress status is available for the hashed version only
    url = BASE_URL + '/feature-extraction/{}'.format(dsid)
    print(" POST", url)
    res = requests.post(url)

    print("\n1.d Check the parameters of the extracted features")
    url = BASE_URL + '/feature-extraction/{}'.format(dsid)
    print(' GET', url)
    res = requests.get(url)
    data = res.json()
    print('\n'.join([' - {}: {}'.format(key, val)
                     for key, val in data.items() if "filenames" not in key]))

    print("\n2. Near Duplicates detection by cosine similarity (DBSCAN)")
    url = BASE_URL + '/clustering/dbscan/'
    print(" POST", url)
    t0 = time()
    res = requests.post(url,
                        json={'dataset_id': dsid,
                              'lsi_components': 100,
                              'eps': 0.1,  # 2*cosine distance for documents to be considered as duplicates
                              'n_max_samples': 2
                              }).json()

    mid = res['id']
    print(" => model id = {}".format(mid))

    url = BASE_URL + '/clustering/dbscan/{}'.format(mid)
    print(" GET", url)
    res = requests.get(url,
                       json={'n_top_words': 0  # don't compute cluster labels
                             }).json()
    t1 = time()
    print(' .. computed in {:.1f}s'.format(t1 - t0))
    labels_ = res['labels']
    print('Found {} duplicates / {}'.format(len(labels_) - len(np.unique(labels_)),
                                            len(labels_)))


    print("\n3.a Near Duplicates Detection using I-Match")
    url = BASE_URL + '/duplicate-detection/'
    print(" POST", url)
    t0 = time()
    res = requests.post(url,
                        json={'dataset_id': dsid,
                              'method': 'i-match',
                              })
    data = res.json()
    mid = data['id']
    print(" => model id = {}".format(mid))
    print(' .. computed in {:.1f}s'.format(time() - t0))

    url = BASE_URL + '/duplicate-detection/{}'.format(mid)
    print(" GET", url)
    t0 = time()
    res = requests.get(url,
                       json={'n_rand_lexicons': 10,
                             'rand_lexicon_ratio': 0.9}).json()
    t1 = time()
    print(' .. computed in {:.1f}s'.format(t1 - t0))
    labels_ = res['cluster_id']
    print('Found {} duplicates / {}'.format(len(labels_) - len(np.unique(labels_)),
                                            len(labels_)))


    if platform.system() == 'Windows':
        print('Simhash-py is currently not implemented for Windows.')
        sys.exit()

    print("\n3.b Duplicate detection by Simhash")
    url = BASE_URL + '/duplicate-detection/'
    print(" POST", url)
    t0 = time()
    res = requests.post(url,
                        json={'dataset_id': dsid,
                              'method': 'simhash',
                              })
    data = res.json()
    mid = data['id']
    print(" => model id = {}".format(mid))
    print(' .. computed in {:.1f}s'.format(time() - t0))

    url = BASE_URL + '/duplicate-detection/{}'.format(mid)
    print(" GET", url)
    t0 = time()
    res = requests.get(url,
                       json={'distance': 1})
    data = res.json()
    print(' .. computed in {:.1f}s'.format(time() - t0))
    labels_ = data['cluster_id']
    print('Found {} duplicates / {}'.format(len(labels_) - len(np.unique(labels_)),
                                            len(labels_)))


    # 4. Cleaning

    print("\n4.a Delete the extracted features")
    url = BASE_URL + '/feature-extraction/{}'.format(dsid)
    print(" DELETE", url)
    requests.delete(url)
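
The ``Found N duplicates / M`` figures printed above count every document whose
``cluster_id`` is shared with at least one other document, i.e. the total number of
documents minus the number of distinct cluster ids. To inspect the duplicate groups
themselves, the returned ids can be grouped with pandas (already imported by the
script); a minimal sketch, with a toy ``labels_`` list standing in for the
``cluster_id`` values returned by the last GET request:

.. code-block:: python

    import pandas as pd

    # labels_ = data['cluster_id']   # as returned by the duplicate-detection GET above
    labels_ = [0, 1, 1, 2, 2, 2, 3]  # toy stand-in for illustration

    s = pd.Series(labels_)
    group_sizes = s.value_counts()
    duplicate_groups = group_sizes[group_sizes > 1]

    # same bookkeeping as the script: documents minus distinct cluster ids
    n_duplicates = len(s) - s.nunique()
    print('{} duplicates in {} groups'.format(n_duplicates, len(duplicate_groups)))
    # -> 3 duplicates in 2 groups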

**Total running time of the script:** ( 0 minutes 49.804 seconds)


.. container:: sphx-glr-footer

    .. container:: sphx-glr-download

        :download:`Download Python source code: duplicate_detection_example.py <duplicate_detection_example.py>`

    .. container:: sphx-glr-download

        :download:`Download Jupyter notebook: duplicate_detection_example.ipynb <duplicate_detection_example.ipynb>`

.. rst-class:: sphx-glr-signature

    `Generated by Sphinx-Gallery <https://sphinx-gallery.readthedocs.io>`_