In [1]:
import pandas as pd
import numpy as np
import json
from urllib.parse import unquote
import glob
import plotly.express as xp
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
%matplotlib inline

# Initializations - Finction Definitions

In [4]:
#Katrin log path
path = '/mnt/ands/logs/katrin'
all_files = glob.glob(path + "/*.log")
update_request_type = 'SERVICE(update)'
get_request_type = 'SERVICE(get)'
downaload_request_type = 'SERVICE(download)'

#function to filterlog data based on request type
def filter_log(df, request_type):
 logs=df[0].str.split(" ",1)
 logs = pd.DataFrame(logs.values.tolist())
 data = pd.DataFrame(logs.apply(lambda logs: logs[1][:int(logs[0])-2], axis=1).str.split(" ").values.tolist())
 data.columns=["Timestamp", "Severity", "Setup", "Server", "Source", "Target", "SessionID", "Request","Latency", "PID", "ClientID", "Len", "MsgLen"]
 data_filtered = data.loc[data["Source"] == request_type]
 data_filtered["Content"] = logs.iloc[data_filtered.index][1]
 return data_filtered

#function to load query logs of type 'SERVICE(update)'
def process_update_katrin_log(df): 
 data_filtered = filter_log(df, update_request_type)
 
 data_filtered["Message"] = data_filtered.apply(lambda data_filtered: data_filtered["Content"][-int(data_filtered["Len"])+int(data_filtered["MsgLen"])+2: ], axis=1)
 data_filtered["Content"] = data_filtered.apply(lambda data_filtered: data_filtered["Content"][:len(data_filtered["Content"]) -int(data_filtered["Len"])+int(data_filtered["MsgLen"])+1], axis=1)
 data_filtered["db_server"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["db_server"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["srctree"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["srctree"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["width"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["width"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["height"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["height"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["db_name"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["db_name"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["db_group"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["db_group"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["control_group"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["control_group"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["db_mask"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["db_mask"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["window"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["window"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["aggregation"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["aggregation"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["resample"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["resample"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered["mask_mode"] = data_filtered.apply(lambda data_filtered: json.loads(json.loads(unquote(data_filtered["Message"]), strict=False)['POST']['props'], strict=False)["mask_mode"] if 'POST' in json.loads(unquote(data_filtered["Message"]), strict=False) else'' , axis=1)
 data_filtered.drop(['Message','MsgLen', 'Len'], axis=1, inplace = True) 
 return data_filtered

#function to load query logs of type 'SERVICE(get)'
def process_get_katrin_log(df) :
 data_filtered = filter_log(df, get_request_type)
 data_filtered["Content"] = logs.iloc[data_filtered.index][1]
 data_filtered["Message"] = data_filtered.apply(lambda data_filtered: data_filtered["Content"][-int(data_filtered["Len"])+int(data_filtered["MsgLen"])+2: ], axis=1)
 data_filtered["rt"] = data_filtered.apply(lambda data_filtered: json.loads(unquote(data_filtered["Message"]), strict=False)['GET'].get("rt") , axis=1)
 data_filtered["db_name"] = data_filtered.apply(lambda data_filtered: json.loads(unquote(data_filtered["Message"]), strict=False)['GET'].get("db_name") , axis=1)
 data_filtered["db_group"] = data_filtered.apply(lambda data_filtered: json.loads(unquote(data_filtered["Message"]), strict=False)['GET'].get("db_group") , axis=1)
 data_filtered["srctree"] = data_filtered.apply(lambda data_filtered: json.loads(unquote(data_filtered["Message"]), strict=False)['GET'].get("srctree") , axis=1)
 data_filtered["db_mask"] = data_filtered.apply(lambda data_filtered: json.loads(unquote(data_filtered["Message"]), strict=False)['GET'].get("db_mask") , axis=1)
 data_filtered["window"] = data_filtered.apply(lambda data_filtered: json.loads(unquote(data_filtered["Message"]), strict=False)['GET'].get("window") , axis=1)
 data_filtered["resample"] = data_filtered.apply(lambda data_filtered: json.loads(unquote(data_filtered["Message"]), strict=False)['GET'].get("resample") , axis=1)
 data_filtered.drop(['MsgLen', 'Severity', 'Len','Target', 'Content', 'Message'], axis=1, inplace = True)
 return data_filtered

#function to load query logs of type 'SERVICE(download)'
def process_download_katrin_log(df) :
 data_filtered = filter_log(df, downaload_request_type)
 data_filtered["Content"] = logs.iloc[data_filtered.index][1]
 data_filtered = data_filtered.loc[~data_filtered['Content'].isnull()]
 if len(data_filtered) >= 1:
 data_filtered["Message"] = data_filtered.apply(lambda data_filtered: data_filtered["Content"][-int(data_filtered["Len"])+int(data_filtered["MsgLen"])+2: ], axis=1)
 data_filtered["Target"] = data_filtered.apply(lambda data_filtered: json.loads(unquote(data_filtered["Message"]), strict=False)['GET'].get("target") , axis=1)
 data_filtered = data_filtered.loc[data_filtered['Target'] == 'dlmanager_add']
 data_filtered.drop(['Content', 'MsgLen', 'Len'], axis=1, inplace = True)
 return data_filtered

#function to chunk a log file and process each chunck individually
def process_chunks(filename, request_type):
 chunksize = 10 ** 6
 i=0
 with pd.read_csv(filename, chunksize=chunksize, sep=r'\t', engine='python', header = None) as reader:
 for chunk in reader: 
 if (request_type == update_request_type):
 processed_df = process_update_katrin_log(chunk)
 elif (request_type == get_request_type):
 processed_df = process_get_katrin_log(chunk)
 elif (request_type == get_request_type):
 processed_df = process_download_katrin_log(chunk)
 if i == 0: 

 df = processed_df
 else:
 df = df.append(processed_df, ignore_index=True) 
 i+=1
 return df


def process_all_files(request_type):
 i = 0
 for filename in all_files:
 if i == 0:
 df = process_chunks(filename, update_request_type)
 else:
 df = df.append(process_chunks(filename, update_request_type), ignore_index=True) 
 i+=1
 return df

# Loading KATRIN Logs


## Loading service(update) logs

In [None]:
processed_update_logs = process_all_files(update_request_type)
processed_update_logs.to_csv("katrin_update.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 data_filtered["Content"] = logs.iloc[data_filtered.index][1]


## Loading service(get) logs

In [None]:
processed_get_logs = process_all_files(get_request_type)
processed_get_logs.to_csv("katrin_get.csv")

## Loading service(download) logs

In [None]:
processed_download_logs = process_all_files(download_request_type)
processed_download_logs.to_csv("katrin_download.csv")