Clustering Example [Python API]
An example of document clustering using the FreeDiscovery Python API: TF-IDF features are reduced with LSI (latent semantic indexing), then clustered with K-means and with Ward hierarchical clustering.
Out:
0. Load Dataset
Warning: downloading dataset treclegal09_2k_subset (2.8 MB) !
File ../treclegal09_2k_subset.tar.gz downloaded!
Archive extracted!
1. Feature extraction (non hashed)
2. Document Clustering (LSI + K-Means)
.. computed in 20.6s
   N_documents                                      cluster_names
3          599          [alias, enron, norm, ect, calo, changed]
5          393  [ect, hou, enron, group, recipients, administr...
4          328  [teneo, recipients, group, administrative, tes...
1          298  [recipients, group, administrative, test, ect,...
8          282  [enron, energy, trade, company, services, pres...
6          164  [shall, party, agreement, transaction, price, ...
7          151            [alias, enron, norm, ect, test, group]
0          114  [enron_development, ect, hou, enron, group, tr...
2           75  [master, trading, enron, ect, credit, counterp...
9           61   [rewrite, server, address, smtp, mail, virtual]
3. Document Clustering (LSI + Ward Hierarchical Clustering)
.. computed in 37.9s
   N_documents                                      cluster_names
1          515          [alias, enron, norm, ect, calo, changed]
2          451  [ect, hou, enron, group, recipients, administr...
0          409  [enron, energy, ect, company, services, agreem...
7          321  [teneo, recipients, group, administrative, tes...
6          231  [recipients, group, administrative, test, ect,...
4          134      [enron, alias, master, ena, trading, credit]
8          123            [alias, enron, norm, ect, test, group]
9          117  [enron_development, ect, hou, enron, group, re...
3          101  [shall, party, agreement, transaction, confirm...
5           63   [rewrite, server, address, smtp, mail, virtual]
import pandas as pd
from freediscovery.text import FeatureVectorizer
from freediscovery.cluster import Clustering
from freediscovery.utils import _silent
from freediscovery.datasets import load_dataset
from time import time
pd.options.display.float_format = '{:,.3f}'.format
dataset_name = "treclegal09_2k_subset"
cache_dir = '..'
print("0. Load Dataset")
ds = load_dataset(dataset_name, cache_dir=cache_dir)
print("\n1. Feature extraction (non hashed)\n")
n_features = 30000
fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(ds['data_dir'],
                     n_features=n_features, use_hashing=False,
                     use_idf=True, stop_words='english')
uuid, filenames = fe.transform()
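
# `uuid` identifies the persisted TF-IDF features in the cache directory and
# is passed as `dsid` to the clustering step below; `filenames` lists the
# documents that were vectorized. With use_hashing=False a conventional
# (non-hashed) vocabulary of at most n_features terms is built.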
print("\n2. Document Clustering (LSI + K-Means)\n")
cat = Clustering(cache_dir=cache_dir, dsid=uuid)  # reuse the extracted features

n_clusters = 10      # number of clusters to produce
n_top_words = 6      # number of terms used to label each cluster
lsi_components = 50  # dimensionality of the LSI projection
def repr_clustering(_labels, _terms):
    """Summarize cluster sizes and their top terms as a DataFrame."""
    out = []
    for ridx, row in enumerate(_terms):
        out.append({'cluster_names': row,
                    'N_documents': (_labels == ridx).sum()})
    out = pd.DataFrame(out).sort_values('N_documents', ascending=False)
    return out
t0 = time()
with _silent('stderr'):  # ignore some deprecation warnings
    labels, tree = cat.k_means(n_clusters, lsi_components=lsi_components)
    terms = cat.compute_labels(n_top_words=n_top_words)
t1 = time()
print(' .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(labels, terms))
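
# Ward hierarchical clustering greedily merges the pair of clusters that
# yields the smallest increase in within-cluster variance. The n_neighbors
# parameter below adds a k-nearest-neighbors connectivity constraint, so
# only documents that are close in the LSI space may be merged; this keeps
# the hierarchy local and the computation tractable.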
print('\n3. Document Clustering (LSI + Ward Hierarchical Clustering)\n')
t0 = time()
with _silent('stderr'):  # ignore some deprecation warnings
    labels, tree = cat.ward_hc(n_clusters,
                               lsi_components=lsi_components,
                               n_neighbors=5  # the connectivity constraint
                               )
    terms = cat.compute_labels(n_top_words=n_top_words)
t1 = time()
print(' .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(labels, terms))
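
For readers who want to see what these calls do under the hood, below is a minimal standalone sketch of the same pipeline written directly with scikit-learn, which FreeDiscovery builds on. The documents list, the parameter values, and the random_state are illustrative assumptions, not part of the example above:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

documents = ["..."]  # placeholder: load your raw document texts here

# TF-IDF features with English stop words removed, vocabulary capped at 30000
vectorizer = TfidfVectorizer(max_features=30000, stop_words='english')
X = vectorizer.fit_transform(documents)

# LSI: truncated SVD followed by L2 normalization, so that Euclidean
# distances in the reduced space approximate cosine similarity
lsi = make_pipeline(TruncatedSVD(n_components=50), Normalizer(copy=False))
X_lsi = lsi.fit_transform(X)

# K-means on the LSI representation
km_labels = KMeans(n_clusters=10, random_state=0).fit_predict(X_lsi)

# Ward clustering with a 5-nearest-neighbors connectivity constraint
connectivity = kneighbors_graph(X_lsi, n_neighbors=5, include_self=False)
ward_labels = AgglomerativeClustering(n_clusters=10, linkage='ward',
                                      connectivity=connectivity
                                      ).fit_predict(X_lsi)

Normalizing the SVD output makes K-means on Euclidean distances behave like spherical (cosine) clustering, which is the usual choice for TF-IDF features.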
Total running time of the script: (1 minute 3.785 seconds)