.. _sphx_glr_examples_clustering_example_python.py:

Clustering Example [Python API]
-------------------------------

An example of clustering using the Python API

.. rst-class:: sphx-glr-script-out

 Out::

    0. Load Dataset
    Warning: downloading dataset treclegal09_2k_subset (2.8 MB) !
    File ../treclegal09_2k_subset.tar.gz downloaded!
    Archive extracted!

    1. Feature extraction (non hashed)

    2. Document Clustering (LSI + K-Means)

     .. computed in 20.6s
       N_documents                                      cluster_names
    3          599           [alias, enron, norm, ect, calo, changed]
    5          393  [ect, hou, enron, group, recipients, administr...
    4          328  [teneo, recipients, group, administrative, tes...
    1          298  [recipients, group, administrative, test, ect,...
    8          282  [enron, energy, trade, company, services, pres...
    6          164  [shall, party, agreement, transaction, price, ...
    7          151             [alias, enron, norm, ect, test, group]
    0          114  [enron_development, ect, hou, enron, group, tr...
    2           75  [master, trading, enron, ect, credit, counterp...
    9           61    [rewrite, server, address, smtp, mail, virtual]

    3. Document Clustering (LSI + Ward Hierarchical Clustering)

     .. computed in 37.9s
       N_documents                                      cluster_names
    1          515           [alias, enron, norm, ect, calo, changed]
    2          451  [ect, hou, enron, group, recipients, administr...
    0          409  [enron, energy, ect, company, services, agreem...
    7          321  [teneo, recipients, group, administrative, tes...
    6          231  [recipients, group, administrative, test, ect,...
    4          134      [enron, alias, master, ena, trading, credit]
    8          123             [alias, enron, norm, ect, test, group]
    9          117  [enron_development, ect, hou, enron, group, re...
    3          101  [shall, party, agreement, transaction, confirm...
    5           63    [rewrite, server, address, smtp, mail, virtual]

|

.. code-block:: python

    import pandas as pd

    from freediscovery.text import FeatureVectorizer
    from freediscovery.cluster import Clustering
    from freediscovery.utils import _silent
    from freediscovery.datasets import load_dataset
    from time import time

    pd.options.display.float_format = '{:,.3f}'.format

    dataset_name = "treclegal09_2k_subset"
    cache_dir = '..'

    print("0. Load Dataset")

    ds = load_dataset(dataset_name, cache_dir=cache_dir)

    print("\n1. Feature extraction (non hashed)\n")

    n_features = 30000
    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(ds['data_dir'],
                         n_features=n_features,
                         use_hashing=False,
                         use_idf=True,
                         stop_words='english')
    uuid, filenames = fe.transform()

    print("\n2. Document Clustering (LSI + K-Means)\n")

    cat = Clustering(cache_dir=cache_dir, dsid=uuid)

    n_clusters = 10
    n_top_words = 6
    lsi_components = 50


    def repr_clustering(_labels, _terms):
        out = []
        for ridx, row in enumerate(_terms):
            out.append({'cluster_names': row,
                        'N_documents': (_labels == ridx).sum()})
        out = pd.DataFrame(out).sort_values('N_documents', ascending=False)
        return out


    t0 = time()
    with _silent('stderr'):  # ignore some deprecation warnings
        labels, tree = cat.k_means(n_clusters, lsi_components=lsi_components)
        terms = cat.compute_labels(n_top_words=n_top_words)
    t1 = time()

    print(' .. computed in {:.1f}s'.format(t1 - t0))
    print(repr_clustering(labels, terms))

    print('\n3. Document Clustering (LSI + Ward Hierarchical Clustering)\n')

    t0 = time()
    with _silent('stderr'):  # ignore some deprecation warnings
        labels, tree = cat.ward_hc(n_clusters,
                                   lsi_components=lsi_components,
                                   n_neighbors=5  # this is the connectivity constraint
                                   )
        terms = cat.compute_labels(n_top_words=n_top_words)
    t1 = time()

    print(' .. computed in {:.1f}s'.format(t1 - t0))
    print(repr_clustering(labels, terms))
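The ``Clustering`` calls above hide the underlying pipeline. For readers who
want to see the moving parts, the following is a minimal, self-contained
sketch of the two approaches used in this example, written against
scikit-learn directly: LSI is a truncated SVD of the TF-IDF matrix followed
by re-normalization, K-Means then runs in the reduced space, and Ward
hierarchical clustering uses a k-nearest-neighbors graph as its connectivity
constraint (the role of ``n_neighbors=5`` above). The toy corpus, component
count, and cluster counts below are illustrative placeholders, not values
from the example; FreeDiscovery is built on scikit-learn, but its internal
implementation may differ in details.

.. code-block:: python

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.preprocessing import Normalizer
    from sklearn.pipeline import make_pipeline
    from sklearn.cluster import KMeans, AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph

    # Toy stand-in corpus; the example above uses the
    # treclegal09_2k_subset documents instead.
    docs = ["master trading agreement with counterparty credit terms",
            "rewrite server address smtp mail virtual",
            "enron energy trade company services",
            "party agreement transaction price confirmation",
            "recipients group administrative test message"]

    # TF-IDF features (matching use_hashing=False, use_idf=True above)
    X = TfidfVectorizer(stop_words='english').fit_transform(docs)

    # LSI: truncated SVD of the TF-IDF matrix, then re-normalization
    lsi = make_pipeline(TruncatedSVD(n_components=3), Normalizer(copy=False))
    X_lsi = lsi.fit_transform(X)

    # K-Means in the reduced LSI space (the example above uses n_clusters=10)
    km_labels = KMeans(n_clusters=2, n_init=10,
                       random_state=0).fit_predict(X_lsi)

    # Ward hierarchical clustering constrained by a k-nearest-neighbors
    # graph, analogous to the n_neighbors parameter of cat.ward_hc above
    connectivity = kneighbors_graph(X_lsi, n_neighbors=2, include_self=False)
    ward = AgglomerativeClustering(n_clusters=2, linkage='ward',
                                   connectivity=connectivity)
    ward_labels = ward.fit_predict(X_lsi)

    print('K-Means labels:', km_labels)
    print('Ward labels:   ', ward_labels)

Re-normalizing after the SVD keeps the Euclidean distances used by K-Means
close to cosine similarities on the LSI vectors, which is the usual choice
for text.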
**Total running time of the script:** ( 1 minutes 3.785 seconds)

.. container:: sphx-glr-footer

    .. container:: sphx-glr-download

       :download:`Download Python source code: clustering_example_python.py <clustering_example_python.py>`

    .. container:: sphx-glr-download

       :download:`Download Jupyter notebook: clustering_example_python.ipynb <clustering_example_python.ipynb>`

.. rst-class:: sphx-glr-signature

    `Generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_