Data Ingestion¶

An example illustrating the data ingestion in FreeDiscovery

from __future__ import print_function

import requests
import pandas as pd
import json
import os.path

# Pandas display settings: floats are rendered with a thousands separator
# and three decimals, and wide frames are not wrapped over several lines.
pd.set_option('display.float_format', '{:,.3f}'.format)
pd.set_option('display.expand_frame_repr', False)

# Name of the example dataset requested from the server
# (see list of available datasets)
dataset_name = "treclegal09_2k_subset"

# FreeDiscovery server URL; every request in this example is built on it
BASE_URL = "http://localhost:5001/api/v0"

Load the test dataset

# Request the description of the example dataset from the server
endpoint = '/example-dataset/{}'.format(dataset_name)
url = BASE_URL + endpoint
print(" GET", url)
input_ds = requests.get(url).json()

# To use a custom dataset, simply build `dataset_definition` yourself:
# a list of dicts, each holding a 'document_id' and a 'file_path'.
data_dir = input_ds['metadata']['data_dir']
dataset_definition = []
for row in input_ds['dataset']:
    # each file path from the response is joined onto the dataset's data_dir
    entry = {'document_id': row['document_id'],
             'file_path': os.path.join(data_dir, row['file_path'])}
    dataset_definition.append(entry)

Out:

GET http://localhost:5001/api/v0/example-dataset/treclegal09_2k_subset

1.a Load dataset and initialize feature extraction

# Create a feature-extraction resource with the `use_hashing` option enabled
url = BASE_URL + '/feature-extraction'
print(" POST", url)
options = {'use_hashing': True}
res = requests.post(url, json=options).json()

# The 'id' field of the response is used to address this resource in the
# requests that follow
dsid = res['id']
received_keys = list(res)
print("   => received {}".format(received_keys))
print("   => dsid = {}".format(dsid))

print("\n1.b Start feature extraction")

# Send the dataset definition to the resource identified by `dsid`
url = BASE_URL+'/feature-extraction/{}'.format(dsid)
print(" POST", url)
payload = {'dataset_definition': dataset_definition}
res = requests.post(url, json=payload)

Out:

POST http://localhost:5001/api/v0/feature-extraction
   => received ['id']
   => dsid = 546bf74b7c334360

1.b Start feature extraction
 POST http://localhost:5001/api/v0/feature-extraction/546bf74b7c334360

2 Check the parameters of the extracted features

# Read back the parameters stored for this feature-extraction resource
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(' GET', url)
res = requests.get(url).json()

# Print one "key: value" line per entry, leaving out any entry whose key
# contains "filenames"
lines = []
for key, val in res.items():
    if "filenames" in key:
        continue
    lines.append('     - {}: {}'.format(key, val))
print('\n'.join(lines))

Out:

GET http://localhost:5001/api/v0/feature-extraction/546bf74b7c334360
     - analyzer: word
     - chunk_size: 5000
     - column_ids: None
     - column_separator: ,
     - data_dir: /home/ubuntu/freediscovery_shared/treclegal09_2k_subset/data/jobRun_4/XML_EXPORT_CONTENT/text_9
     - max_df: 1.0
     - min_df: 0.0
     - n_features: 100001
     - n_jobs: 1
     - n_samples: 2465
     - n_samples_processed: 2465
     - ngram_range: [1, 1]
     - norm_alpha: 0.75
     - parse_email_headers: False
     - preprocess: []
     - stop_words: english
     - use_hashing: True
     - weighting: nnc

3 Examine the id mapping

# Query the id-mapping endpoint for the first three documents of the dataset
method = BASE_URL + "/feature-extraction/{}/id-mapping".format(dsid)
# NOTE(review): the label printed here says GET while the request below is
# sent with requests.post -- confirm which one is intended.
print('\n GET', method)

# The request body lists the internal ids to look up
first_rows = input_ds['dataset'][:3]
query = []
for row in first_rows:
    query.append({'internal_id': row['internal_id']})
data = {'data': query}
print('   DATA', json.dumps(data))
res = requests.post(method, json=data).json()

# Pretty-print the decoded JSON response
print(' Response:')
pretty = json.dumps(res, indent=4)
print('  ', pretty)

Out:

GET http://localhost:5001/api/v0/feature-extraction/546bf74b7c334360/id-mapping
   DATA {"data": [{"internal_id": 0}, {"internal_id": 1}, {"internal_id": 2}]}
 Response:
   {
    "data": [
        {
            "document_id": 0,
            "file_path": "0.7.47.1097257.txt",
            "internal_id": 0
        },
        {
            "document_id": 1,
            "file_path": "0.7.47.1097258.txt",
            "internal_id": 1
        },
        {
            "document_id": 4,
            "file_path": "0.7.47.1097259.txt",
            "internal_id": 2
        }
    ]
}

4 Delete the extracted features

# Issue a DELETE request against the feature-extraction resource for `dsid`
resource_path = '/feature-extraction/{}'.format(dsid)
url = BASE_URL + resource_path
print(" DELETE", url)
requests.delete(url)

Out:

DELETE http://localhost:5001/api/v0/feature-extraction/546bf74b7c334360

Total running time of the script: ( 0 minutes 1.983 seconds)

Generated by Sphinx-Gallery