.. _sphx_glr_examples_clustering_example.py:


Clustering Example [REST API]
-----------------------------

Cluster documents into clusters


.. rst-class:: sphx-glr-script-out

 Out::

     0. Load the test dataset
     GET http://localhost:5001/api/v0/datasets/treclegal09_2k_subset

    1.a Load dataset and initialize feature extraction
     POST http://localhost:5001/api/v0/feature-extraction
     => received [u'id', u'filenames']
     => dsid = eb33f9e665314b23b00d9b17077bd02a

    1.b Run feature extraction
     POST http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a

    1.d. check the parameters of the extracted features
     GET http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a
     - binary: False
     - n_jobs: -1
     - stop_words: english
     - use_hashing: False
     - min_df: 4.0
     - n_samples: 2465
     - analyzer: word
     - ngram_range: [1, 1]
     - max_df: 0.75
     - chunk_size: 2000
     - use_idf: True
     - data_dir: ../freediscovery_shared/treclegal09_2k_subset/data
     - sublinear_tf: True
     - n_samples_processed: 2465
     - n_features: 30001
     - norm: l2

    2.a. Document clustering (LSI + K-means)
     POST http://localhost:5001/api/v0/clustering/k-mean/
     => model id = d79f5cd2a3214057abf96ea79e81ad77

    2.b. Computing cluster labels
     GET http://localhost:5001/api/v0/clustering/k-mean/d79f5cd2a3214057abf96ea79e81ad77
     .. computed in 21.1s
       N_documents                                      cluster_names
    2          512  [u'enron', u'energy', u'trade', u'new', u'powe...
    3          508  [u'recipients', u'administrative', u'group', u...
    0          428  [u'tenet', u'test', u'oct', u'nov', u'tue', u'...
    1          309  [u'shackleton', u'enron_development', u'ect', ...
    9          256  [u'ect', u'hou', u'tana', u'jones', u'enron', ...
    8          130  [u'shall', u'party', u'agreement', u'transacti...
    4          103  [u'sanders', u'nov', u'ect', u'test', u'lunch'...
    6           98  [u'nemec', u'ect', u'ruppert', u'hou', u'doc',...
    7           64  [u'migration', u'outlook', u'team', u'mtg', u'...
    5           57  [u'rewrite', u'server', u'address', u'smtp', u...

    3.a. Document clustering (LSI + Ward HC)
     POST http://localhost:5001/api/v0/clustering/ward_hc/
     => model id = c5a66afa9b7f408fb7cebbbd393fc23a

    3.b. Computing cluster labels
     GET http://localhost:5001/api/v0/clustering/ward_hc/c5a66afa9b7f408fb7cebbbd393fc23a
     .. computed in 36.8s
       N_documents                                      cluster_names
    1          480  [u'recipients', u'administrative', u'group', u...
    2          465  [u'enron', u'energy', u'trading', u'power', u'...
    4          441  [u'tenet', u'test', u'oct', u'nov', u'tue', u'...
    8          376  [u'shackleton', u'ect', u'test', u'hou', u'gro...
    0          275  [u'ect', u'hou', u'nemec', u'group', u'test', ...
    3          136  [u'shall', u'party', u'agreement', u'transacti...
    7          101  [u'sanders', u'nov', u'ect', u'test', u'lunch'...
    9           70  [u'enron_development', u'ect', u'shackleton', ...
    5           64  [u'migration', u'outlook', u'team', u'mtg', u'...
    6           57  [u'rewrite', u'server', u'address', u'smtp', u...

    4.a Delete the extracted features
     DELETE http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a


|


.. code-block:: python


    import numpy as np
    import pandas as pd
    from time import time
    import requests

    pd.options.display.float_format = '{:,.3f}'.format


    def repr_clustering(labels, terms):
        """Summarize a clustering result as a table sorted by cluster size.

        ``labels`` is an array holding one cluster index per document and
        ``terms`` holds one entry (the top terms) per cluster; the position
        of an entry in ``terms`` is the cluster index it describes.

        Returns a DataFrame with the columns ``N_documents`` and
        ``cluster_names``, largest cluster first.
        """
        rows = [{'N_documents': (labels == ridx).sum(), 'cluster_names': row}
                for ridx, row in enumerate(terms)]
        # Explicit columns keep the column order independent of the pandas
        # version and let an empty result be sorted instead of raising.
        out = pd.DataFrame(rows, columns=['N_documents', 'cluster_names'])
        return out.sort_values('N_documents', ascending=False)


    dataset_name = "treclegal09_2k_subset"     # see list of available datasets

    BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL

    print(" 0. Load the test dataset")
    url = BASE_URL + '/datasets/{}'.format(dataset_name)
    # the request below is a GET, so log the matching HTTP verb
    print(" GET", url)
    res = requests.get(url).json()

    # To use a custom dataset, simply specify the following variables
    data_dir = res['data_dir']

    # 1. Feature extraction (non hashed)

    print("\n1.a Load dataset and initialize feature extraction")
    url = BASE_URL + '/feature-extraction'
    print(" POST", url)
    fe_opts = {'data_dir': data_dir,
               'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
               'use_idf': 1, 'sublinear_tf': 1, 'binary': 0, 'n_features': 30001,
               'analyzer': 'word', 'ngram_range': (1, 1), "norm": "l2",
               'use_hashing': False,  # hashing should be disabled for clustering
               'min_df': 4, 'max_df': 0.75
               }
    res = requests.post(url, json=fe_opts).json()

    dsid = res['id']
    print(" => received {}".format(list(res.keys())))
    print(" => dsid = {}".format(dsid))

    print("\n1.b Run feature extraction")
    # progress status is available for the hashed version only
    url = BASE_URL+'/feature-extraction/{}'.format(dsid)
    print(" POST", url)
    requests.post(url)

    print("\n1.d. check the parameters of the extracted features")
    url = BASE_URL + '/feature-extraction/{}'.format(dsid)
    print(' GET', url)
    res = requests.get(url).json()

    # the (long) list of file names is left out of the summary
    print('\n'.join(' - {}: {}'.format(key, val)
                    for key, val in res.items()
                    if "filenames" not in key))

    # 2. Document Clustering (LSI + K-Means)

    print("\n2.a. Document clustering (LSI + K-means)")

    url = BASE_URL + '/clustering/k-mean/'
    print(" POST", url)
    t0 = time()
    res = requests.post(url,
                        json={'dataset_id': dsid,
                              'n_clusters': 10,
                              'lsi_components': 50
                              }).json()

    mid = res['id']
    print(" => model id = {}".format(mid))

    print("\n2.b. Computing cluster labels")
    url = BASE_URL + '/clustering/k-mean/{}'.format(mid)
    print(" GET", url)
    res = requests.get(url,
                       json={'n_top_words': 6
                             }).json()
    t1 = time()

    print(' .. computed in {:.1f}s'.format(t1 - t0))
    print(repr_clustering(np.array(res['labels']), res['cluster_terms']))

    # 3. Document Clustering (LSI + Ward Hierarchical Clustering)

    print("\n3.a. Document clustering (LSI + Ward HC)")

    url = BASE_URL + '/clustering/ward_hc/'
    print(" POST", url)
    t0 = time()
    res = requests.post(url,
                        json={'dataset_id': dsid,
                              'n_clusters': 10,
                              'lsi_components': 50,
                              'n_neighbors': 5  # this is the connectivity constraint
                              }).json()

    mid = res['id']
    print(" => model id = {}".format(mid))

    print("\n3.b. Computing cluster labels")
    url = BASE_URL + '/clustering/ward_hc/{}'.format(mid)
    print(" GET", url)
    res = requests.get(url,
                       json={'n_top_words': 6
                             }).json()
    t1 = time()

    print(' .. computed in {:.1f}s'.format(t1 - t0))
    print(repr_clustering(np.array(res['labels']), res['cluster_terms']))

    # 4. Cleaning
    print("\n4.a Delete the extracted features")
    url = BASE_URL + '/feature-extraction/{}'.format(dsid)
    print(" DELETE", url)
    requests.delete(url)

**Total running time of the script:** ( 1 minutes 0.130 seconds)



.. container:: sphx-glr-footer


  .. container:: sphx-glr-download

     :download:`Download Python source code: clustering_example.py <clustering_example.py>`



  .. container:: sphx-glr-download

     :download:`Download Jupyter notebook: clustering_example.ipynb <clustering_example.ipynb>`

.. rst-class:: sphx-glr-signature

    `Generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_