.. _sphx_glr_examples_clustering_example.py:


Clustering Example [REST API]
-----------------------------

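Group documents into clusters with K-means and Ward hierarchical clustering (both applied after LSI) through the FreeDiscovery REST API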


.. rst-class:: sphx-glr-script-out

 Out::

    0. Load the test dataset
     POST http://localhost:5001/api/v0/datasets/treclegal09_2k_subset

    1.a Load dataset and initalize feature extraction
     POST http://localhost:5001/api/v0/feature-extraction
       => received [u'id', u'filenames']
       => dsid = eb33f9e665314b23b00d9b17077bd02a

    1.b Run feature extraction
     POST http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a

    1.d. check the parameters of the extracted features
     GET http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a
         - binary: False
         - n_jobs: -1
         - stop_words: english
         - use_hashing: False
         - min_df: 4.0
         - n_samples: 2465
         - analyzer: word
         - ngram_range: [1, 1]
         - max_df: 0.75
         - chunk_size: 2000
         - use_idf: True
         - data_dir: ../freediscovery_shared/treclegal09_2k_subset/data
         - sublinear_tf: True
         - n_samples_processed: 2465
         - n_features: 30001
         - norm: l2

    2.a. Document clustering (LSI + K-means)
     POST http://localhost:5001/api/v0/clustering/k-mean/
         => model id = d79f5cd2a3214057abf96ea79e81ad77

    2.b. Computing cluster labels
     GET http://localhost:5001/api/v0/clustering/k-mean/d79f5cd2a3214057abf96ea79e81ad77
        .. computed in 21.1s
       N_documents                                      cluster_names
    2          512  [u'enron', u'energy', u'trade', u'new', u'powe...
    3          508  [u'recipients', u'administrative', u'group', u...
    0          428  [u'tenet', u'test', u'oct', u'nov', u'tue', u'...
    1          309  [u'shackleton', u'enron_development', u'ect', ...
    9          256  [u'ect', u'hou', u'tana', u'jones', u'enron', ...
    8          130  [u'shall', u'party', u'agreement', u'transacti...
    4          103  [u'sanders', u'nov', u'ect', u'test', u'lunch'...
    6           98  [u'nemec', u'ect', u'ruppert', u'hou', u'doc',...
    7           64  [u'migration', u'outlook', u'team', u'mtg', u'...
    5           57  [u'rewrite', u'server', u'address', u'smtp', u...

    3.a. Document clustering (LSI + Ward HC)
     POST http://localhost:5001/api/v0/clustering/ward_hc/
         => model id = c5a66afa9b7f408fb7cebbbd393fc23a

    3.b. Computing cluster labels
     GET http://localhost:5001/api/v0/clustering/ward_hc/c5a66afa9b7f408fb7cebbbd393fc23a
        .. computed in 36.8s
       N_documents                                      cluster_names
    1          480  [u'recipients', u'administrative', u'group', u...
    2          465  [u'enron', u'energy', u'trading', u'power', u'...
    4          441  [u'tenet', u'test', u'oct', u'nov', u'tue', u'...
    8          376  [u'shackleton', u'ect', u'test', u'hou', u'gro...
    0          275  [u'ect', u'hou', u'nemec', u'group', u'test', ...
    3          136  [u'shall', u'party', u'agreement', u'transacti...
    7          101  [u'sanders', u'nov', u'ect', u'test', u'lunch'...
    9           70  [u'enron_development', u'ect', u'shackleton', ...
    5           64  [u'migration', u'outlook', u'team', u'mtg', u'...
    6           57  [u'rewrite', u'server', u'address', u'smtp', u...

    4.a Delete the extracted features
     DELETE http://localhost:5001/api/v0/feature-extraction/eb33f9e665314b23b00d9b17077bd02a


|


.. code-block:: python


    import numpy as np
    import pandas as pd
    from time import time
    import requests

    pd.options.display.float_format = '{:,.3f}'.format


    def repr_clustering(labels, terms):
        """Summarize a clustering result as a table with one row per cluster.

        Parameters
        ----------
        labels : array-like of int
            Cluster index assigned to each document.
        terms : sequence
            Entry ``i`` is the description of cluster ``i`` (stored as-is in
            the ``cluster_names`` column).

        Returns
        -------
        pandas.DataFrame
            Columns ``N_documents`` and ``cluster_names``, indexed by cluster
            number and sorted by decreasing number of documents.
        """
        # Coerce to an array so that a plain list of labels also supports the
        # element-wise comparison below.
        labels = np.asarray(labels)
        rows = [{'N_documents': (labels == ridx).sum(), 'cluster_names': row}
                for ridx, row in enumerate(terms)]
        # Explicit columns keep the column order independent of the pandas
        # version and allow an empty result to be sorted without a KeyError.
        out = pd.DataFrame(rows, columns=['N_documents', 'cluster_names'])
        return out.sort_values('N_documents', ascending=False)

    dataset_name = "treclegal09_2k_subset"     # see list of available datasets

    BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL

    # Step 0: ask the server where the named test dataset is stored.
    print(" 0. Load the test dataset")
    url = BASE_URL + '/datasets/{}'.format(dataset_name)
    # The request issued below is a GET, so the logged verb must say GET too.
    print(" GET", url)
    res = requests.get(url).json()

    # To use a custom dataset, simply specify the following variables
    data_dir = res['data_dir']

    # # 1. Feature extraction (non hashed)

    print("\n1.a Load dataset and initalize feature extraction")

    # Vectorizer settings submitted to the server, one option per line.
    fe_opts = {
        'data_dir': data_dir,
        'stop_words': 'english',
        'chunk_size': 2000,
        'n_jobs': -1,
        'use_idf': 1,
        'sublinear_tf': 1,
        'binary': 0,
        'n_features': 30001,
        'analyzer': 'word',
        'ngram_range': (1, 1),
        "norm": "l2",
        'use_hashing': False,  # hashing should be disabled for clustering
        'min_df': 4,
        'max_df': 0.75,
    }

    url = '{}/feature-extraction'.format(BASE_URL)
    print(" POST", url)
    res = requests.post(url, json=fe_opts).json()

    # The returned id identifies this feature set in all later requests.
    dsid = res['id']
    print("   => received {}".format(list(res.keys())))
    print("   => dsid = {}".format(dsid))


    print("\n1.b Run feature extraction")
    # progress status is available for the hashed version only
    url = '{}/feature-extraction/{}'.format(BASE_URL, dsid)
    print(" POST", url)
    requests.post(url)

    print("\n1.d. check the parameters of the extracted features")
    # Same resource URL as in 1.b, this time queried with GET.
    print(' GET', url)
    res = requests.get(url).json()

    # One line per returned setting, skipping keys that contain "filenames".
    param_lines = ('     - {}: {}'.format(key, val)
                   for key, val in res.items()
                   if "filenames" not in key)
    print('\n'.join(param_lines))


    # # 2. Document Clustering (LSI + K-Means)

    print("\n2.a. Document clustering (LSI + K-means)")

    url = '{}/clustering/k-mean/'.format(BASE_URL)
    print(" POST", url)
    kmeans_opts = {'dataset_id': dsid, 'n_clusters': 10, 'lsi_components': 50}
    t0 = time()  # the timer spans both the POST and the GET below
    res = requests.post(url, json=kmeans_opts).json()

    mid = res['id']
    print("     => model id = {}".format(mid))

    print("\n2.b. Computing cluster labels")
    url = '{}/clustering/k-mean/{}'.format(BASE_URL, mid)
    print(" GET", url)
    res = requests.get(url, json={'n_top_words': 6}).json()
    t1 = time()

    print('    .. computed in {:.1f}s'.format(t1 - t0))
    # Summarize cluster sizes and terms, largest cluster first.
    kmeans_labels = np.array(res['labels'])
    print(repr_clustering(kmeans_labels, res['cluster_terms']))


    # # 3. Document Clustering (LSI + Ward Hierarchical Clustering)

    print("\n3.a. Document clustering (LSI + Ward HC)")

    url = '{}/clustering/ward_hc/'.format(BASE_URL)
    print(" POST", url)
    ward_opts = {
        'dataset_id': dsid,
        'n_clusters': 10,
        'lsi_components': 50,
        'n_neighbors': 5,  # this is the connectivity constraint
    }
    t0 = time()  # the timer spans both the POST and the GET below
    res = requests.post(url, json=ward_opts).json()

    mid = res['id']
    print("     => model id = {}".format(mid))

    print("\n3.b. Computing cluster labels")
    url = '{}/clustering/ward_hc/{}'.format(BASE_URL, mid)
    print(" GET", url)
    res = requests.get(url, json={'n_top_words': 6}).json()
    t1 = time()

    print('    .. computed in {:.1f}s'.format(t1 - t0))
    # Summarize cluster sizes and terms, largest cluster first.
    ward_labels = np.array(res['labels'])
    print(repr_clustering(ward_labels, res['cluster_terms']))


    # 4. Cleaning
    print("\n4.a Delete the extracted features")
    # Remove the feature set created in step 1 from the server.
    url = '{}/feature-extraction/{}'.format(BASE_URL, dsid)
    print(" DELETE", url)
    requests.delete(url)

**Total running time of the script:** (1 minute 0.130 seconds)


.. container:: sphx-glr-footer


  .. container:: sphx-glr-download

     :download:`Download Python source code: clustering_example.py <clustering_example.py>`


  .. container:: sphx-glr-download

     :download:`Download Jupyter notebook: clustering_example.ipynb <clustering_example.ipynb>`

.. rst-class:: sphx-glr-signature

    `Generated by Sphinx-Gallery <http://sphinx-gallery.readthedocs.io>`_