Clustering Example [REST API]
Cluster documents into groups with K-means or Ward hierarchical clustering on LSI features, using the FreeDiscovery REST API.
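As a quick orientation before the full output and script below, the whole example reduces to a handful of REST calls. The following condensed sketch is not part of the generated example; the endpoint paths and parameters are taken from the script at the bottom of this page:

import requests

BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL

# load the example dataset shipped with FreeDiscovery
data_dir = requests.get(BASE_URL + '/datasets/treclegal09_2k_subset').json()['data_dir']
# initialize (non-hashed) feature extraction for the documents in data_dir
dsid = requests.post(BASE_URL + '/feature-extraction',
                     json={'data_dir': data_dir, 'use_hashing': False}).json()['id']
# run the extraction
requests.post(BASE_URL + '/feature-extraction/{}'.format(dsid))
# cluster in a 50-dimensional LSI space with K-means
mid = requests.post(BASE_URL + '/clustering/k-mean/',
                    json={'dataset_id': dsid, 'n_clusters': 10,
                          'lsi_components': 50}).json()['id']
# retrieve the cluster labels and the top terms per cluster
labels = requests.get(BASE_URL + '/clustering/k-mean/{}'.format(mid),
                      json={'n_top_words': 6}).json()['labels']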
Out:
0. Load the test dataset
GET http://localhost:5001/api/v0/datasets/treclegal09_2k_subset
1.a Load dataset and initialize feature extraction
POST http://localhost:5001/api/v0/feature-extraction
=> received [u'id', u'filenames']
=> dsid = eb33f9e665314b23b00d9b17077bd02a
1.b Run feature extraction
POST http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a
1.c. Check the parameters of the extracted features
GET http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a
- binary: False
- n_jobs: -1
- stop_words: english
- use_hashing: False
- min_df: 4.0
- n_samples: 2465
- analyzer: word
- ngram_range: [1, 1]
- max_df: 0.75
- chunk_size: 2000
- use_idf: True
- data_dir: ../freediscovery_shared/treclegal09_2k_subset/data
- sublinear_tf: True
- n_samples_processed: 2465
- n_features: 30001
- norm: l2
2.a. Document clustering (LSI + K-means)
POST http://localhost:5001/api/v0/clustering/k-mean/
=> model id = d79f5cd2a3214057abf96ea79e81ad77
2.b. Computing cluster labels
GET http://localhost:5001/api/v0/clustering/k-mean/d79f5cd2a3214057abf96ea79e81ad77
.. computed in 21.1s
N_documents cluster_names
2 512 [u'enron', u'energy', u'trade', u'new', u'powe...
3 508 [u'recipients', u'administrative', u'group', u...
0 428 [u'tenet', u'test', u'oct', u'nov', u'tue', u'...
1 309 [u'shackleton', u'enron_development', u'ect', ...
9 256 [u'ect', u'hou', u'tana', u'jones', u'enron', ...
8 130 [u'shall', u'party', u'agreement', u'transacti...
4 103 [u'sanders', u'nov', u'ect', u'test', u'lunch'...
6 98 [u'nemec', u'ect', u'ruppert', u'hou', u'doc',...
7 64 [u'migration', u'outlook', u'team', u'mtg', u'...
5 57 [u'rewrite', u'server', u'address', u'smtp', u...
3.a. Document clustering (LSI + Ward HC)
POST http://localhost:5001/api/v0/clustering/ward_hc/
=> model id = c5a66afa9b7f408fb7cebbbd393fc23a
3.b. Computing cluster labels
GET http://localhost:5001/api/v0/clustering/ward_hc/c5a66afa9b7f408fb7cebbbd393fc23a
.. computed in 36.8s
N_documents cluster_names
1 480 [u'recipients', u'administrative', u'group', u...
2 465 [u'enron', u'energy', u'trading', u'power', u'...
4 441 [u'tenet', u'test', u'oct', u'nov', u'tue', u'...
8 376 [u'shackleton', u'ect', u'test', u'hou', u'gro...
0 275 [u'ect', u'hou', u'nemec', u'group', u'test', ...
3 136 [u'shall', u'party', u'agreement', u'transacti...
7 101 [u'sanders', u'nov', u'ect', u'test', u'lunch'...
9 70 [u'enron_development', u'ect', u'shackleton', ...
5 64 [u'migration', u'outlook', u'team', u'mtg', u'...
6 57 [u'rewrite', u'server', u'address', u'smtp', u...
4.a Delete the extracted features
DELETE http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a
import numpy as np
import pandas as pd
from time import time
import requests
pd.options.display.float_format = '{:,.3f}'.format
def repr_clustering(labels, terms):
    """Summarize a clustering as a DataFrame with one row per cluster,
    giving its size and top terms, sorted by decreasing size."""
    out = []
    for ridx, row in enumerate(terms):
        out.append({'cluster_names': row, 'N_documents': (labels == ridx).sum()})
    out = pd.DataFrame(out).sort_values('N_documents', ascending=False)
    return out
dataset_name = "treclegal09_2k_subset" # see list of available datasets
BASE_URL = "http://localhost:5001/api/v0" # FreeDiscovery server URL
print(" 0. Load the test dataset")
url = BASE_URL + '/datasets/{}'.format(dataset_name)
print(" POST", url)
res = requests.get(url).json()
# To use a custom dataset, simply set data_dir to its location instead
data_dir = res['data_dir']
# # 1. Feature extraction (non hashed)
print("\n1.a Load dataset and initalize feature extraction")
url = BASE_URL + '/feature-extraction'
print(" POST", url)
fe_opts = {'data_dir': data_dir,
           'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
           'use_idf': 1, 'sublinear_tf': 1, 'binary': 0, 'n_features': 30001,
           'analyzer': 'word', 'ngram_range': (1, 1), "norm": "l2",
           'use_hashing': False,  # hashing should be disabled for clustering
           'min_df': 4, 'max_df': 0.75
           }
res = requests.post(url, json=fe_opts).json()
dsid = res['id']
print(" => received {}".format(list(res.keys())))
print(" => dsid = {}".format(dsid))
print("\n1.b Run feature extraction")
# progress status is available for the hashed version only
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(" POST", url)
res = requests.post(url)
print("\n1.d. check the parameters of the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(' GET', url)
res = requests.get(url).json()
print('\n'.join([' - {}: {}'.format(key, val)
                 for key, val in res.items()
                 if "filenames" not in key]))
# # 2. Document Clustering (LSI + K-Means)
print("\n2.a. Document clustering (LSI + K-means)")
url = BASE_URL + '/clustering/k-mean/'
print(" POST", url)
t0 = time()
res = requests.post(url,
                    json={'dataset_id': dsid,
                          'n_clusters': 10,
                          'lsi_components': 50
                          }).json()
mid = res['id']
print(" => model id = {}".format(mid))
print("\n2.b. Computing cluster labels")
url = BASE_URL + '/clustering/k-mean/{}'.format(mid)
print(" GET", url)
res = requests.get(url,
                   json={'n_top_words': 6
                         }).json()
t1 = time()
print(' .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(np.array(res['labels']), res['cluster_terms']))
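# Quick cross-check (an addition to the original example): the N_documents
# column above can also be recomputed directly from the raw cluster labels.
from collections import Counter
kmeans_labels = np.array(res['labels'])  # saved for the comparison with Ward below
print(Counter(res['labels']))            # e.g. the largest cluster here has 512 docs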
# # 3. Document Clustering (LSI + Ward Hierarchical Clustering)
print("\n3.a. Document clustering (LSI + Ward HC)")
url = BASE_URL + '/clustering/ward_hc/'
print(" POST", url)
t0 = time()
res = requests.post(url,
                    json={'dataset_id': dsid,
                          'n_clusters': 10,
                          'lsi_components': 50,
                          'n_neighbors': 5  # connectivity constraint: merges follow the k-NN graph
                          }).json()
mid = res['id']
print(" => model id = {}".format(mid))
print("\n3.b. Computing cluster labels")
url = BASE_URL + '/clustering/ward_hc/{}'.format(mid)
print(" GET", url)
res = requests.get(url,
                   json={'n_top_words': 6
                         }).json()
t1 = time()
print(' .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(np.array(res['labels']), res['cluster_terms']))
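# How similar are the two partitions? A sketch assuming scikit-learn is
# installed on the client (FreeDiscovery itself builds on it); kmeans_labels
# was saved after step 2.b above.
from sklearn.metrics import adjusted_rand_score
ward_labels = np.array(res['labels'])
print('Adjusted Rand index (K-means vs Ward): {:.3f}'.format(
    adjusted_rand_score(kmeans_labels, ward_labels)))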
# 4. Cleaning
print("\n4.a Delete the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(" DELETE", url)
requests.delete(url)
Total running time of the script: 1 minute 0.130 seconds