Clustering Example [Python API]
An example of document clustering using the FreeDiscovery Python API: TF-IDF features are reduced with LSI (latent semantic indexing), then clustered with K-means and with Ward hierarchical clustering.
Out:
0. Load Dataset
Warning: downloading dataset treclegal09_2k_subset (2.8 MB) !
File ../treclegal09_2k_subset.tar.gz downloaded!
Archive extracted!
1. Feature extraction (non hashed)
2. Document Clustering (LSI + K-Means)
.. computed in 20.6s
   N_documents                                      cluster_names
3          599          [alias, enron, norm, ect, calo, changed]
5          393  [ect, hou, enron, group, recipients, administr...
4          328  [teneo, recipients, group, administrative, tes...
1          298  [recipients, group, administrative, test, ect,...
8          282  [enron, energy, trade, company, services, pres...
6          164  [shall, party, agreement, transaction, price, ...
7          151            [alias, enron, norm, ect, test, group]
0          114  [enron_development, ect, hou, enron, group, tr...
2           75  [master, trading, enron, ect, credit, counterp...
9           61   [rewrite, server, address, smtp, mail, virtual]
3. Document Clustering (LSI + Ward Hierarchical Clustering)
.. computed in 37.9s
   N_documents                                      cluster_names
1          515          [alias, enron, norm, ect, calo, changed]
2          451  [ect, hou, enron, group, recipients, administr...
0          409  [enron, energy, ect, company, services, agreem...
7          321  [teneo, recipients, group, administrative, tes...
6          231  [recipients, group, administrative, test, ect,...
4          134      [enron, alias, master, ena, trading, credit]
8          123            [alias, enron, norm, ect, test, group]
9          117  [enron_development, ect, hou, enron, group, re...
3          101  [shall, party, agreement, transaction, confirm...
5           63   [rewrite, server, address, smtp, mail, virtual]
import pandas as pd
from freediscovery.text import FeatureVectorizer
from freediscovery.cluster import Clustering
from freediscovery.utils import _silent
from freediscovery.datasets import load_dataset
from time import time
pd.options.display.float_format = '{:,.3f}'.format
dataset_name = "treclegal09_2k_subset"
cache_dir = '..'
print("0. Load Dataset")
ds = load_dataset(dataset_name, cache_dir=cache_dir)
print("\n1. Feature extraction (non hashed)\n")
n_features = 30000
fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(ds['data_dir'],
                     n_features=n_features, use_hashing=False,
                     use_idf=True, stop_words='english')
uuid, filenames = fe.transform()
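
# `uuid` identifies the persisted TF-IDF features in the cache directory and
# is passed as `dsid` to the clustering step below; `filenames` lists the
# documents that were vectorized. With use_hashing=False a conventional
# (non-hashed) vocabulary of at most n_features terms is built.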
print("\n2. Document Clustering (LSI + K-Means)\n")
cat = Clustering(cache_dir=cache_dir, dsid=uuid)  # reuse the extracted features

n_clusters = 10      # number of clusters to produce
n_top_words = 6      # number of terms used to label each cluster
lsi_components = 50  # dimensionality of the LSI projection
def repr_clustering(_labels, _terms):
    """Summarize cluster sizes and their top terms as a DataFrame."""
    out = []
    for ridx, row in enumerate(_terms):
        out.append({'cluster_names': row,
                    'N_documents': (_labels == ridx).sum()})
    out = pd.DataFrame(out).sort_values('N_documents', ascending=False)
    return out
t0 = time()
with _silent('stderr'):  # ignore some deprecation warnings
    labels, tree = cat.k_means(n_clusters, lsi_components=lsi_components)
    terms = cat.compute_labels(n_top_words=n_top_words)
t1 = time()
print(' .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(labels, terms))
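
# Ward hierarchical clustering greedily merges the pair of clusters that
# yields the smallest increase in within-cluster variance. The n_neighbors
# parameter below adds a k-nearest-neighbors connectivity constraint, so
# only documents that are close in the LSI space may be merged; this keeps
# the hierarchy local and the computation tractable.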
print('\n3. Document Clustering (LSI + Ward Hierarchical Clustering)\n')
t0 = time()
with _silent('stderr'):  # ignore some deprecation warnings
    labels, tree = cat.ward_hc(n_clusters,
                               lsi_components=lsi_components,
                               n_neighbors=5  # the connectivity constraint
                               )
    terms = cat.compute_labels(n_top_words=n_top_words)
t1 = time()
print(' .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(labels, terms))
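
For readers who want to see what these calls do under the hood, below is a minimal standalone sketch of the same pipeline written directly with scikit-learn, which FreeDiscovery builds on. The documents list, the parameter values, and the random_state are illustrative assumptions, not part of the example above:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

documents = ["..."]  # placeholder: load your raw document texts here

# TF-IDF features with English stop words removed, vocabulary capped at 30000
vectorizer = TfidfVectorizer(max_features=30000, stop_words='english')
X = vectorizer.fit_transform(documents)

# LSI: truncated SVD followed by L2 normalization, so that Euclidean
# distances in the reduced space approximate cosine similarity
lsi = make_pipeline(TruncatedSVD(n_components=50), Normalizer(copy=False))
X_lsi = lsi.fit_transform(X)

# K-means on the LSI representation
km_labels = KMeans(n_clusters=10, random_state=0).fit_predict(X_lsi)

# Ward clustering with a 5-nearest-neighbors connectivity constraint
connectivity = kneighbors_graph(X_lsi, n_neighbors=5, include_self=False)
ward_labels = AgglomerativeClustering(n_clusters=10, linkage='ward',
                                      connectivity=connectivity
                                      ).fit_predict(X_lsi)

Normalizing the SVD output makes K-means on Euclidean distances behave like spherical (cosine) clustering, which is the usual choice for TF-IDF features.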
Total running time of the script: (1 minute 3.785 seconds)