Duplicate Detection

Find near-duplicates in a text collection

from __future__ import print_function

from time import time

import pandas as pd
import requests

# Render floats in pandas output with a thousands separator and 3 decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

# Name of the example dataset to request (see list of available datasets).
dataset_name = "treclegal09_2k_subset"

# FreeDiscovery server URL; every endpoint used below is appended to it.
BASE_URL = "http://localhost:5001/api/v0"

0. Load the test dataset

# Ask the server for the description of the example dataset.
url = '{}/example-dataset/{}'.format(BASE_URL, dataset_name)
print(" GET", url)
response = requests.get(url)
input_ds = response.json()

# To use a custom dataset, simply specify the following variables
metadata = input_ds['metadata']
data_dir = metadata['data_dir']

Out:

GET http://localhost:5001/api/v0/example-dataset/treclegal09_2k_subset

1. Feature extraction (non hashed)

print("\n1.a Load dataset and initalize feature extraction")
url = '{}/feature-extraction'.format(BASE_URL)
print(" POST", url)

# Feature extraction options posted to the server as the JSON body.
fe_opts = {
    'weighting': 'ntc',
    'n_features': 30001,
    'min_df': 4,
    'max_df': 0.75,
}
res = requests.post(url, json=fe_opts)

# Decode the reply once; its 'id' identifies the dataset in later requests.
payload = res.json()
dsid = payload['id']
print("   => received {}".format(list(payload.keys())))
print("   => dsid = {}".format(dsid))

Out:

1.a Load dataset and initalize feature extraction
 POST http://localhost:5001/api/v0/feature-extraction
   => received ['id']
   => dsid = bbe8f0756df24039

1.b Run feature extraction

# Start processing of the documents found in data_dir for dataset dsid.
url = BASE_URL+'/feature-extraction/{}'.format(dsid)
print(" POST", url)
res = requests.post(url, json={"data_dir": data_dir})
# This response is overwritten below, so check it here: an HTTP error from
# the processing request must not go unnoticed.
res.raise_for_status()

print("\n1.d. check the parameters of the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(' GET', url)
res = requests.get(url)

# Print one "key: value" line per returned parameter, skipping every key
# that contains "filenames".
data = res.json()
print('\n'.join(['     - {}: {}'.format(key, val)
      for key, val in data.items() if "filenames" not in key]))

Out:

POST http://localhost:5001/api/v0/feature-extraction/bbe8f0756df24039

1.d. check the parameters of the extracted features
 GET http://localhost:5001/api/v0/feature-extraction/bbe8f0756df24039
     - analyzer: word
     - chunk_size: 5000
     - column_ids: None
     - column_separator: ,
     - data_dir: /home/ubuntu/freediscovery_shared/treclegal09_2k_subset/data/jobRun_4/XML_EXPORT_CONTENT/text_9
     - max_df: 0.75
     - min_df: 4.0
     - n_features: 30001
     - n_jobs: 1
     - n_samples: 2465
     - n_samples_processed: 2465
     - ngram_range: [1, 1]
     - norm_alpha: 0.75
     - parse_email_headers: False
     - preprocess: []
     - stop_words: english
     - use_hashing: False
     - weighting: ntc

2. Compute LSI

url = '{}/lsi/'.format(BASE_URL)
print("POST", url)

# Number of LSI components requested from the server.
n_components = 100
lsi_opts = {'n_components': n_components, 'parent_id': dsid}
res = requests.post(url, json=lsi_opts).json()

# Identifier of the LSI model; used as parent_id by the clustering step.
lsi_id = res['id']

Out:

POST http://localhost:5001/api/v0/lsi/

3. Near Duplicates detection by cosine similarity (DBSCAN)

url = '{}/clustering/dbscan/'.format(BASE_URL)
print(" POST", url)
t0 = time()
dbscan_opts = {
    'parent_id': lsi_id,
    'min_similarity': 0.90,
    'n_max_samples': 2,
}
res = requests.post(url, json=dbscan_opts).json()

mid = res['id']
print("     => model id = {}".format(mid))

url = '{}/clustering/dbscan/{}'.format(BASE_URL, mid)
print(" GET", url)
# don't compute cluster labels
res = requests.get(url, json={'n_top_words': 0}).json()
t1 = time()

# The reported time spans both the POST and the GET above.
print('    .. computed in {:.1f}s'.format(t1 - t0))

data = res['data']
# Count the documents in every returned row holding more than one document.
n_duplicates = 0
for row in data:
    n_docs = len(row['documents'])
    if n_docs > 1:
        n_duplicates += n_docs
print('Found {} duplicates / {}'
      .format(n_duplicates, len(input_ds['dataset'])))

Out:

POST http://localhost:5001/api/v0/clustering/dbscan/
     => model id = feb939e284b14fd1
 GET http://localhost:5001/api/v0/clustering/dbscan/feb939e284b14fd1
    .. computed in 6.8s
Found 233 duplicates / 2465

4. Near Duplicates Detection using I-Match

# Create an I-Match duplicate detection model on top of the extracted features.
url = BASE_URL + '/duplicate-detection/'
print(" POST", url)
t0 = time()
res = requests.post(url, json={'parent_id': dsid,
                               'method': 'i-match'})

data = res.json()
mid = data['id']
print("     => model id = {}".format(mid))

print('    .. computed in {:.1f}s'.format(time() - t0))


# Query the model; the timer is restarted so only this request is measured.
url = BASE_URL + '/duplicate-detection/{}'.format(mid)
print(" GET", url)
t0 = time()
res = requests.get(url, json={'n_rand_lexicons': 10,
                              'rand_lexicon_ratio': 0.9}).json()
# Use the timestamp taken right after the response instead of sampling the
# clock a second time inside the print call.
t1 = time()
print('    .. computed in {:.1f}s'.format(t1 - t0))

data = res['data']

# Every returned row is counted in full (no size filter, unlike the DBSCAN
# step); presumably the server only returns groups of duplicates - TODO confirm.
print('Found {} duplicates / {}'
      .format(sum([len(row['documents']) for row in data]),
              len(input_ds['dataset'])))

Out:

POST http://localhost:5001/api/v0/duplicate-detection/
     => model id = d5c30d1c90ec46b5
    .. computed in 0.0s
 GET http://localhost:5001/api/v0/duplicate-detection/d5c30d1c90ec46b5
    .. computed in 4.5s
Found 556 duplicates / 2465

5. Duplicate detection by Simhash

# Run the Simhash workflow only when the simhash package can be imported.
# The try block guards the import alone, so an ImportError raised anywhere
# else is not misreported as "simhash is not installed".
try:
    import simhash  # noqa: F401  (imported only as an availability check)
except ImportError:
    print("simhash is not installed or not supported "
          " (e.g. on Windows)")
else:
    # Create the Simhash duplicate detection model.
    url = BASE_URL + '/duplicate-detection/'
    print(" POST", url)
    t0 = time()
    res = requests.post(url, json={'parent_id': dsid,
                                   'method': 'simhash'})

    data = res.json()
    mid = data['id']
    print("     => model id = {}".format(mid))

    print('    .. computed in {:.1f}s'.format(time() - t0))

    # Query the model; the timer is restarted to measure this request only.
    url = BASE_URL + '/duplicate-detection/{}'.format(mid)
    print(" GET", url)
    t0 = time()
    res = requests.get(url, json={'distance': 1})
    data = res.json()
    print('    .. computed in {:.1f}s'.format(time() - t0))

    data = data['data']

    # Sum the number of documents over all returned rows.
    print('Found {} duplicates / {}'
          .format(sum([len(row['documents']) for row in data]),
                  len(input_ds['dataset'])))

Out:

POST http://localhost:5001/api/v0/duplicate-detection/
     => model id = 2092541707a645e6
    .. computed in 0.5s
 GET http://localhost:5001/api/v0/duplicate-detection/2092541707a645e6
    .. computed in 1.0s
Found 475 duplicates / 2465

6. Delete the extracted features

# Send a DELETE request for the feature-extraction resource identified by dsid.
url = '{}/feature-extraction/{}'.format(BASE_URL, dsid)
requests.delete(url)

Total running time of the script: ( 0 minutes 15.555 seconds)

Generated by Sphinx-Gallery