Clustering Example [Python API]

An example of document clustering using the FreeDiscovery Python API

Out:

0. Load Dataset

Warning: downloading dataset treclegal09_2k_subset (2.8 MB) !

File ../treclegal09_2k_subset.tar.gz downloaded!
Archive extracted!

1. Feature extraction (non-hashed)


2. Document Clustering (LSI + K-Means)

    .. computed in 20.6s
   N_documents                                      cluster_names
3          599           [alias, enron, norm, ect, calo, changed]
5          393  [ect, hou, enron, group, recipients, administr...
4          328  [teneo, recipients, group, administrative, tes...
1          298  [recipients, group, administrative, test, ect,...
8          282  [enron, energy, trade, company, services, pres...
6          164  [shall, party, agreement, transaction, price, ...
7          151             [alias, enron, norm, ect, test, group]
0          114  [enron_development, ect, hou, enron, group, tr...
2           75  [master, trading, enron, ect, credit, counterp...
9           61    [rewrite, server, address, smtp, mail, virtual]

3. Document Clustering (LSI + Ward Hierarchical Clustering)

    .. computed in 37.9s
   N_documents                                      cluster_names
1          515           [alias, enron, norm, ect, calo, changed]
2          451  [ect, hou, enron, group, recipients, administr...
0          409  [enron, energy, ect, company, services, agreem...
7          321  [teneo, recipients, group, administrative, tes...
6          231  [recipients, group, administrative, test, ect,...
4          134       [enron, alias, master, ena, trading, credit]
8          123             [alias, enron, norm, ect, test, group]
9          117  [enron_development, ect, hou, enron, group, re...
3          101  [shall, party, agreement, transaction, confirm...
5           63    [rewrite, server, address, smtp, mail, virtual]

import pandas as pd
from freediscovery.text import FeatureVectorizer
from freediscovery.cluster import Clustering
from freediscovery.utils import _silent
from freediscovery.datasets import load_dataset
from time import time

pd.options.display.float_format = '{:,.3f}'.format

dataset_name = "treclegal09_2k_subset"
cache_dir = '..'


print("0. Load Dataset")

ds = load_dataset(dataset_name, cache_dir=cache_dir)
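# ``ds['data_dir']`` points to the downloaded and extracted documents that
# are vectorized in the next step.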


print("\n1. Feature extraction (non hashed)\n")

n_features = 30000  # cap on the number of extracted features
fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(ds['data_dir'],
                     n_features=n_features, use_hashing=False,
                     use_idf=True, stop_words='english')
uuid, filenames = fe.transform()
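# ``uuid`` identifies the extracted feature set in the cache and is reused by
# the clustering step below; ``filenames`` lists the vectorized documents.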


print("\n2. Document Clustering (LSI + K-Means)\n")

cat = Clustering(cache_dir=cache_dir, dsid=uuid)

n_clusters = 10      # number of clusters to extract
n_top_words = 6      # number of top words used to label each cluster
lsi_components = 50  # dimensionality of the LSI projection


def repr_clustering(_labels, _terms):
    """Summarize a clustering: one row per cluster with its size and the
    top terms that label it, sorted by decreasing size."""
    out = []
    for ridx, row in enumerate(_terms):
        out.append({'cluster_names': row,
                    'N_documents': (_labels == ridx).sum()})
    out = pd.DataFrame(out).sort_values('N_documents', ascending=False)
    return out


t0 = time()
with _silent('stderr'):  # ignore some deprecation warnings
    labels, tree = cat.k_means(n_clusters, lsi_components=lsi_components)
    terms = cat.compute_labels(n_top_words=n_top_words)
t1 = time()

print('    .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(labels, terms))
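
# Keep a copy of the K-Means partition: ``labels`` is overwritten by the Ward
# clustering step below, and the copy is used in the comparison at the end.
labels_kmeans = labels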


print('\n3. Document Clustering (LSI + Ward Hierarchical Clustering)\n')

t0 = time()
with _silent('stderr'):  # ignore some deprecation warnings
    labels, tree = cat.ward_hc(n_clusters,
                               lsi_components=lsi_components,
                               n_neighbors=5  # defines the connectivity constraint
                               )
    terms = cat.compute_labels(n_top_words=n_top_words)
t1 = time()

print('    .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(labels, terms))
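
# As an optional follow-up (not part of the output shown above), the two
# partitions can be compared with scikit-learn's adjusted Rand index, which
# equals 1.0 for identical clusterings and stays near 0.0 for unrelated ones:
from sklearn.metrics import adjusted_rand_score

ari = adjusted_rand_score(labels_kmeans, labels)
print('    adjusted Rand index (K-Means vs Ward): {:.3f}'.format(ari))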

Total running time of the script: (1 minute 3.785 seconds)

Generated by Sphinx-Gallery