Data Ingestion
An example illustrating the data ingestion in FreeDiscovery
from __future__ import print_function
import requests
import pandas as pd
import json
import os.path
# Pandas display settings: show floats with a thousands separator and
# three decimals, and do not wrap wide frames across several lines.
pd.set_option('display.float_format', '{:,.3f}'.format)
pd.set_option('display.expand_frame_repr', False)

# Name of the example dataset to load (see the list of available datasets).
dataset_name = "treclegal09_2k_subset"

# URL of the FreeDiscovery server; every request below is built from it.
BASE_URL = "http://localhost:5001/api/v0"
0 Load the test dataset
# Ask the server for the description of the example dataset.
endpoint = '/example-dataset/{}'.format(dataset_name)
url = BASE_URL + endpoint
print(" GET", url)
input_ds = requests.get(url).json()

# To ingest a custom dataset, define `data_dir` and `dataset_definition`
# yourself instead of deriving them from the example dataset response.
# Each entry pairs a document id with the path of its file, joined onto
# the dataset's data directory.
data_dir = input_ds['metadata']['data_dir']
dataset_definition = [
    {
        'document_id': entry['document_id'],
        'file_path': os.path.join(data_dir, entry['file_path']),
    }
    for entry in input_ds['dataset']
]
Out:
GET http://localhost:5001/api/v0/example-dataset/treclegal09_2k_subset
1.a Load dataset and initialize feature extraction
# 1.a -- POST to the feature-extraction endpoint with hashing enabled;
# the 'id' field of the JSON reply is kept as `dsid` for later requests.
url = BASE_URL + '/feature-extraction'
print(" POST", url)
init_options = {'use_hashing': True}
res = requests.post(url, json=init_options).json()
dsid = res['id']
print(" => received {}".format(list(res)))
print(" => dsid = {}".format(dsid))

# 1.b -- POST the dataset definition to the resource identified by `dsid`.
print("\n1.b Start feature extraction")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(" POST", url)
ingestion_payload = {'dataset_definition': dataset_definition}
res = requests.post(url, json=ingestion_payload)
Out:
POST http://localhost:5001/api/v0/feature-extraction
=> received ['id']
=> dsid = 546bf74b7c334360
1.b Start feature extraction
POST http://localhost:5001/api/v0/feature-extraction/546bf74b7c334360
2 Check the parameters of the extracted features
# Fetch the parameters stored for this feature-extraction resource and
# print them one per line, skipping any key that contains "filenames".
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(' GET', url)
res = requests.get(url).json()
param_lines = [
    ' - {}: {}'.format(name, value)
    for name, value in res.items()
    if "filenames" not in name
]
print(*param_lines, sep='\n')
Out:
GET http://localhost:5001/api/v0/feature-extraction/546bf74b7c334360
- analyzer: word
- chunk_size: 5000
- column_ids: None
- column_separator: ,
- data_dir: /home/ubuntu/freediscovery_shared/treclegal09_2k_subset/data/jobRun_4/XML_EXPORT_CONTENT/text_9
- max_df: 1.0
- min_df: 0.0
- n_features: 100001
- n_jobs: 1
- n_samples: 2465
- n_samples_processed: 2465
- ngram_range: [1, 1]
- norm_alpha: 0.75
- parse_email_headers: False
- preprocess: []
- stop_words: english
- use_hashing: True
- weighting: nnc
3 Examine the id mapping
# Query the id-mapping endpoint for the first three rows of the dataset,
# identified by their 'internal_id'.
# NOTE: the label printed below says GET, but the request itself is
# sent with requests.post.
method = BASE_URL + "/feature-extraction/{}/id-mapping".format(dsid)
print('\n GET', method)
first_rows = input_ds['dataset'][:3]
data = {
    'data': [{'internal_id': entry['internal_id']} for entry in first_rows],
}
print(' DATA', json.dumps(data))
res = requests.post(method, json=data).json()
print(' Response:')
pretty_response = json.dumps(res, indent=4)
print(' ', pretty_response)
Out:
GET http://localhost:5001/api/v0/feature-extraction/546bf74b7c334360/id-mapping
DATA {"data": [{"internal_id": 0}, {"internal_id": 1}, {"internal_id": 2}]}
Response:
{
"data": [
{
"document_id": 0,
"file_path": "0.7.47.1097257.txt",
"internal_id": 0
},
{
"document_id": 1,
"file_path": "0.7.47.1097258.txt",
"internal_id": 1
},
{
"document_id": 4,
"file_path": "0.7.47.1097259.txt",
"internal_id": 2
}
]
}
4 Delete the extracted features
# Send a DELETE request for the feature-extraction resource identified by
# `dsid`; the response is not inspected.
resource_path = '/feature-extraction/{}'.format(dsid)
url = BASE_URL + resource_path
print(" DELETE", url)
requests.delete(url)
Out:
DELETE http://localhost:5001/api/v0/feature-extraction/546bf74b7c334360
Total running time of the script: ( 0 minutes 1.983 seconds)