Log retrieval and storage via python + AWS

This example demonstrates how to retrieve your client audit logs from the CAL endpoint and upload them to an Amazon S3 bucket.

''' The script demonstrates how to get a token and retrieve files for download from the CAL endpoint. '''
  
#!/usr/bin/env python
  
import sys
import hashlib
import tempfile
import boto3
import requests
from botocore.errorfactory import ClientError
  
def get_token(secret):
    ''' Exchange a Basic-auth client secret for a Keycloak Bearer access token. '''

    auth_header = {'Authorization': 'Basic ' + secret}
    token_resp = requests.post(
        'https://identity.indexexchange.com/auth/realms/eventlog/protocol/openid-connect/token',
        headers=auth_header,
        data={'grant_type': 'client_credentials'},
    )
    # Fail loudly on any non-2xx response (bad secret, identity outage, ...).
    token_resp.raise_for_status()

    return token_resp.json()["access_token"]
  
def get_available_downloads(token):
    ''' Fetch the list of downloadable CAL files using the given access token. '''

    bearer_header = {'Authorization': 'Bearer ' + token}
    downloads_resp = requests.get(
        'https://app.indexexchange.com/api/cal/v1/downloads',
        headers=bearer_header,
    )
    # Surface auth/availability problems immediately.
    downloads_resp.raise_for_status()

    return downloads_resp.json()
  
def upload_file_to_s3(s3_client, token, bucket, bucket_key, url, expected_md5sum):
    ''' Download a file from CAL and upload it to S3.

    The file is streamed to a temporary file in chunks so large downloads
    never have to fit in memory, and its md5sum is verified against the
    expected value before the upload.

    Raises requests.HTTPError on a failed download and Exception when the
    checksum of the downloaded data does not match expected_md5sum.
    '''

    token_header = {'Authorization': 'Bearer ' + token}

    resp = requests.get(url, headers=token_header, timeout=5, stream=True)
    resp.raise_for_status()

    # download CAL file to disk in chunks so we don't hold huge files in memory
    with tempfile.NamedTemporaryFile() as tmp:
        md5sum = hashlib.md5()
        for chunk in resp.iter_content(chunk_size=32000):
            if chunk:
                tmp.write(chunk)
                md5sum.update(chunk)

        if md5sum.hexdigest() != expected_md5sum:
            # NOTE: typo "m5dsum" fixed in the message below.
            raise Exception('md5sum of downloaded file does not match expected value. Actual: {}, Expected: {}'.format(md5sum.hexdigest(), expected_md5sum))

        # BUG FIX: flush Python's write buffer to disk before boto3 re-opens
        # the file by name; otherwise the tail of the file may be missing
        # from the upload.
        tmp.flush()

        # Save metadata to be used when re-downloading, so we can skip files that are unchanged (have the same md5sum)
        s3_client.upload_file(tmp.name, bucket, bucket_key, ExtraArgs={'Metadata': {'md5sum': md5sum.hexdigest()}})

        # print() call form works on both Python 2 and 3 for a single argument.
        print('Successfully uploaded file {}'.format(bucket_key))
  
def is_file_uploaded(s3_client, bucket, bucket_key, md5sum):
    ''' Return True when bucket_key already exists in the bucket and its
    stored md5sum metadata matches the given checksum. '''

    try:
        head = s3_client.head_object(Bucket=bucket, Key=bucket_key)
        metadata = head['Metadata']
        # Uploaded-and-unchanged means the metadata carries a matching md5sum.
        return 'md5sum' in metadata and metadata['md5sum'] == md5sum
    except ClientError:
        # The key does not exist in the bucket (head_object failed).
        return False
  
def main():
    ''' Standard process for retrieving CAL files: Retrieve auth token, get
    list of available files, download files and upload to S3.

    Usage: python get_event_logs.py <client_secret> <aws_access_key> <aws_secret_key>
    '''

    if len(sys.argv) != 4:
        print('Usage: python get_event_logs.py <client_secret> <aws_access_key> <aws_secret_key>')
        # BUG FIX: exit non-zero on a usage error; quit() is intended for
        # interactive sessions and exits with status 0.
        sys.exit(1)

    token = get_token(sys.argv[1])
    downloads = get_available_downloads(token)

    print('Number of available downloads: {}'.format(downloads['count']))

    # Connect to S3 Client via access key and secret key
    client = boto3.client(
        's3',
        aws_access_key_id=sys.argv[2],
        aws_secret_access_key=sys.argv[3],
    )

    # Download all available files and push them to an S3 bucket
    for download in downloads['availableDownloads']:
        # Destination file named after the hour
        bucket_key = '{}.gz'.format(download['hour'])
        bucket = 'insert-your-bucket-name-here'

        # Skip files already in the bucket with an unchanged md5sum.
        if not is_file_uploaded(client, bucket, bucket_key, download['md5sum']):
            upload_file_to_s3(client, token, bucket, bucket_key, download['downloadURL'], download['md5sum'])
  
# Entry point: run the retrieval pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()