{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import json\n",
"import plotly.express as xp\n",
"import seaborn as sns\n",
"sns.set_style(\"white\")\n",
"import matplotlib.pyplot as plt\n",
"from plotly.colors import *"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Initializations and Functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#Internal functions \n",
"\n",
"#function to calculate the number of distinct groups (tables) used in a query given the value of srctree\n",
"def get_len(data):\n",
" l = 1\n",
" ar = []\n",
" #split the value of srctree by comma to retrieve the different data sources (server__table__column)\n",
" lst = data.split(',')\n",
" for s in lst:\n",
" #for each data source, retrieve the name of the table\n",
" if len(s.split('__')) >=3:\n",
" ar.append(s.split('__')[2])\n",
" if len(set(ar)) >=0:\n",
" #get the number of distinct table names\n",
" l = len(set(ar))\n",
" return l\n",
"\n",
"\n",
"#function to return the distinct names of servers per row\n",
"def get_servers(data):\n",
" servers = []\n",
" #split the value of srctree by comma to retrieve the different data sources (server__table__column)\n",
" lst = data.split(',')\n",
" for s in lst:\n",
" #for each data source, retrieve the name of the server\n",
" if len(s.split('__')) >=1:\n",
" servers.append(s.split('__')[0])\n",
" return set(servers)\n",
"\n",
"#function to return the number of servers per row\n",
"def get_server_number(data):\n",
" servers = []\n",
" #split the value of srctree by comma to retrieve the different data sources (server__table__column)\n",
" lst = data.split(',')\n",
" for s in lst:\n",
" #for each data source, retrieve the name of the server\n",
" if len(s.split('__')) >=1:\n",
" servers.append(s.split('__')[0])\n",
" return len(set(servers))\n",
"\n",
"#function to calculate the number of columns from srctree \n",
"def get_col_nb(data):\n",
" l = 1\n",
" ar = []\n",
" lst = data.split(',')\n",
" if len(lst) >= 0:\n",
" l = len(lst)\n",
" return str(l)\n",
"\n",
"#function to calculate the number of columns from db_mask\n",
"def get_col_nb_novirtual(data):\n",
" if pd.isnull(data):\n",
" return 'All'\n",
" try: \n",
" float(data)\n",
" return '1'\n",
" except ValueError: \n",
" s = data.split(',')\n",
" l = len(s) \n",
" return str(l)\n",
"\n",
"#function to plot histogram with percentage\n",
"def plot_his_percentage(data, group_col, title, xtitle, ytitle):\n",
" data_grouped = data[[group_col]].groupby([group_col]).size().reset_index(name='count')\n",
" fig = xp.histogram(data_grouped.sort_values(by=['count'], ascending=False),histnorm = 'percent',x = group_col, y = 'count', title=title, width=600, height=400 ,color_discrete_sequence=['#75BDA7']).update_layout(xaxis_title = xtitle, yaxis_title=ytitle) \n",
"\n",
" fig.update_layout(yaxis={\"ticksuffix\":\"%\"})\n",
" fig.show()\n",
" \n",
"\n",
"# function to plot 100% stacked bar chart representing the percentage of a certan category\n",
"def plot_stacked_chart(data, col, title):\n",
" data_grouped=data[['Setup',col]].groupby(['Setup',col]).size().reset_index(name='count')\n",
" data_grouped['Total'] = data_grouped['count'].sum()\n",
" data_grouped['Percentage']=data_grouped['count']/data_grouped['Total']\n",
" data_grouped['PercentageLabel']=data_grouped['Percentage'].transform(lambda x: '{:,.0%}'.format(x)) \n",
" data_grouped['Label']= data_grouped[['PercentageLabel',col]].apply(lambda x : '{}: {}'.format(x[1],x[0]), axis=1)\n",
" fig=xp.bar(data_grouped, x='Setup', y='Percentage', title=title, color=col, barmode ='stack', width=340, height=500, text = \"Label\")\n",
" fig.update_layout(showlegend=False)\n",
" fig.update_yaxes(showticklabels=False, visible=False)\n",
" fig.update_xaxes(showticklabels=False, visible=False)\n",
" fig.show()\n",
" \n",
"\n",
"#function to plot scattered bar chart showing the relation between 2 numerical columns\n",
"def plot_scattered_chart(data, xcol, ycol, xcol_label, ycol_label, title):\n",
" fig = xp.scatter(data, y=ycol, x=xcol, width=600, height=400, title=title,render_mode='webgl',\n",
"\n",
" labels={\n",
" xcol: xcol_label,\n",
" ycol: ycol_label\n",
" }\n",
" )\n",
" fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Processing"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Programs\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3049: DtypeWarning:\n",
"\n",
"Columns (19,20) have mixed types.Specify dtype option on import or set low_memory=False.\n",
"\n"
]
}
],
"source": [
"#Datasets are loaded from the csv consolidated datasets that are generated in the preparation phase\n",
"#load KATRIN update requests\n",
"katrin_update_data = pd.read_csv('./Data/KATRIN_Update_WithMessage.csv')\n",
"#load KATRIN getdata requests\n",
"katrin_read_data = pd.read_csv('./Data/KATRIN_get_no_rt.csv')\n",
"#load KATRIN donwload requests\n",
"katrin_download_data = pd.read_csv('./Data/Katrin_download.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merge Datasets"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#define common columns\n",
"common_columns = ['Timestamp', 'Setup', 'db_server', 'db_mask', 'Source', 'Request', 'PID', 'ClientID', 'db_name', 'db_group', 'srctree', 'window', 'resample', 'Latency', 'width', 'height', 'SessionID']\n",
"#initialize width and height data for the download and Get datasets as they dont exist\n",
"katrin_download_data['width'] = None\n",
"katrin_read_data['width'] = None\n",
"katrin_download_data['height'] = None\n",
"katrin_read_data['height'] = None\n",
"katrin_read_data['db_server'] = katrin_read_data['Server']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#Merge the datasets\n",
"katrin_update_data_c = katrin_update_data[common_columns]\n",
"katrin_read_data_c = katrin_read_data[common_columns]\n",
"katrin_download_data_c = katrin_download_data[common_columns]\n",
"data_c = katrin_update_data_c\n",
"data_c = data_c.append(katrin_read_data_c, ignore_index=True)\n",
"data_c = data_c.append(katrin_download_data_c, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#remove duplicate requests - request with same request Id\n",
"from datetime import datetime\n",
"data_c['Timestamp'] = pd.to_datetime(data_c['Timestamp'])\n",
"data_grouped_request = data_c.groupby([ 'Setup', 'db_server', 'db_mask', 'Source', 'Request', 'PID', 'ClientID', 'db_name', 'db_group', 'srctree', 'window', 'resample','width', 'height', 'SessionID'], dropna=False).agg({'Timestamp' : ['count',np.max], 'Latency':[np.max]}).reset_index()\n",
"data_grouped_request.columns = [ 'Setup', 'db_server', 'db_mask', 'Source', 'Request', 'PID', 'ClientID', 'db_name', 'db_group', 'srctree', 'window', 'resample','width', 'height', 'SessionID', 'RequestCount', 'Timestamp', 'Latency']\n",
"data_c = data_grouped_request"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Calculate Data Recency, Duration (Time Span) and Records Limit"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Case when window = '0'"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[data_c[\"window\"] == '0',\"period\"] = 'All'\n",
"data_c.loc[data_c[\"window\"] == '0',\"data_age\"] = 'Begining'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Case when window starts with - : Get data from the begining of the experiment"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[data_c['window'].str.contains('^-', regex=True, na=False),\"period\"] = 'All'\n",
"data_c.loc[~data_c['window'].str.contains('^-', regex=True, na=False),\"data_age\"] = 'Begining'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Initialize start and end timespans\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"#split window by '-' to retrieve the start and end timespans\n",
"data_c[['window_start','window_end']] = data_c['window'].str.split('-',expand=True)\n",
"#remove data with empty window\n",
"data_c = data_c.loc[~data_c['window'].isnull()]\n",
"#clean start and end timespans\n",
"data_c['window_start_clean'] = data_c['window_start'].str.split('.',expand=True)[0]\n",
"data_c['window_end_clean'] = data_c['window_end'].str.split('.',expand=True)[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Extract Query Limit \n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data_c[['window_end_clean', 'limit']] = data_c['window_end_clean'].str.split(',',expand=True)\n",
"data_c.loc[data_c['limit'] == 'undefined:undefined', 'limit'] = None\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.000255151508966024\n",
"0.0\n",
"1.9160602565803573\n"
]
}
],
"source": [
"# % of queries with data limit\n",
"print(len(data_c[~data_c['limit'].isnull()])/len(data_c)*100)\n",
"# % of queries that return the last value\n",
"print(len(data_c[data_c['window'] == '-1'])/len(data_c)*100)\n",
"# % of queries that return all the records\n",
"print(len(data_c[data_c['window'] == '0'])/len(data_c)*100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clean start and end timestamps\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#Get timespan in second\n",
"data_c['timestamp_sec'] = data_c.apply(lambda data: datetime.timestamp(data['Timestamp']) , axis = 1)\n",
"#clean window start and window end values\n",
"data_c.loc[data_c['window_start_clean'] != '','window_start_clean'] = data_c.loc[data_c['window_start_clean'] != '','window_start_clean'].astype('int')\n",
"data_c['window_start_clean'] =data_c['window_start_clean'].replace(r'^\\s*$', np.nan, regex=True)\n",
"data_c['window_end_clean'] =data_c['window_end_clean'].replace(r'^\\s*$', np.nan, regex=True)\n",
"data_c.loc[(~data_c['window_end_clean'].isnull()) & (data_c[\"window_end_clean\"].astype('float') > data_c[\"timestamp_sec\"]),'window_end_clean'] = data_c.loc[(~data_c['window_end_clean'].isnull()) & (data_c[\"window_end_clean\"].astype('float') > data_c[\"timestamp_sec\"]),'timestamp_sec']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" #### Calculate duration in case window = number"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[(~data_c['window_start_clean'].isnull() )& (data_c['window_start_clean'].astype('Int64')<= 31536000), 'duration_sec' ] = data_c.loc[(~data_c['window_start_clean'].isnull() )& (data_c['window_start_clean'].astype('Int64')<= 31536000), 'window_start_clean' ]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Calculate duration for window with start and no end\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[(data_c['window_start_clean'] > 31536000) & (data_c['window_end_clean'].isnull()),'duration_sec'] = data_c.loc[(data_c['window_start_clean'] > 31536000) & (data_c['window_end_clean'].isnull()),'timestamp_sec'] - data_c.loc[(data_c['window_start_clean'] > 31536000) & (data_c['window_end_clean'].isnull()),'window_start_clean']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Calculate duration for window with start and end"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[(data_c['window_start_clean'] > 31536000) & (~data_c['window_end_clean'].isnull() ),'duration_sec'] = data_c.loc[(data_c['window_start_clean'] > 31536000) & (~data_c['window_end_clean'].isnull() ),'window_end_clean'].astype('int') - data_c.loc[(data_c['window_start_clean'] > 31536000) & (~data_c['window_end_clean'].isnull() ),'window_start_clean'].astype('int')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"data_c['duration_min'] = data_c['duration_sec']/60\n",
"data_c['duration_hour'] = data_c['duration_sec']/3600\n",
"data_c['duration_day'] = data_c['duration_hour']/24"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate Recency\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"#extract data age for requests with window = duration (no separator)\n",
"data_c.loc[data_c['window_start_clean'] <= 31536000,'query_from'] = data_c.loc[data_c['window_start_clean'] <= 31536000,'duration_sec'] \n",
"#extract data age for requests with window start and no window end \n",
"data_c.loc[(data_c['window_start_clean'] > 31536000) & (data_c['window_end_clean'].isnull()),'query_from'] = data_c.loc[(data_c['window_start_clean'] > 31536000) & (data_c['window_end_clean'].isnull()),'duration_sec']\n",
"#extract data age for requests with window start and window end\n",
"data_c.loc[(data_c['window_start_clean'] > 31536000) & (~data_c['window_end_clean'].isnull()),'query_from'] = data_c.loc[(data_c['window_start_clean'] > 31536000) & (~data_c['window_end_clean'].isnull()),'timestamp_sec'].astype(float) - data_c.loc[(data_c['window_start_clean'] > 31536000) & (~data_c['window_end_clean'].isnull()),'window_start_clean']\n",
"data_c['data_age_sec'] = data_c['query_from']\n",
"data_c['data_age_min'] = data_c['data_age_sec']/60\n",
"data_c['data_age_hour'] = data_c['data_age_sec']/3600\n",
"data_c['data_age_day'] = data_c['data_age_hour']/24\n",
"\n",
"data_c = data_c.loc[(data_c['data_age_day'] >=0) | (data_c['data_age_day'].isnull()) ]\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate Recency and Duration Categories\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[(data_c[\"data_age_sec\"] <= 43200) & (data_c[\"data_age_sec\"] >0 ) ,\"data_age_cat\"] = '0-12hr'\n",
"# data_c.loc[(data_c[\"data_age_sec\"] > 300) &(data_c[\"data_age_sec\"] <= 900) ,\"data_age_cat\"] = '5mn-15mn'\n",
"\n",
"# data_c.loc[(data_c[\"data_age_sec\"] > 900) &(data_c[\"data_age_sec\"] <= 3600) ,\"data_age_cat\"] = '15mn-1hr'\n",
"# data_c.loc[(data_c[\"data_age_sec\"] > 3600) &(data_c[\"data_age_sec\"] <= 10800.0) ,\"data_age_cat\"] = '1hr-3hr'\n",
"# data_c.loc[(data_c[\"data_age_sec\"] > 10800) &(data_c[\"data_age_sec\"] <= 21600) ,\"data_age_cat\"] = '3hr-6hr'\n",
"# data_c.loc[(data_c[\"data_age_sec\"] > 21600) &(data_c[\"data_age_sec\"] <= 43200) ,\"data_age_cat\"] = '6hr-12hr'\n",
"data_c.loc[(data_c[\"data_age_sec\"] > 43200) &(data_c[\"data_age_sec\"] <= 86400) ,\"data_age_cat\"] = '12hr-1day'\n",
"\n",
"# data_c.loc[(data_c[\"data_age_hour\"] > 24)& (data_c[\"data_age_hour\"] <=48) ,\"data_age_cat\"] = '1-2days'\n",
"data_c.loc[(data_c[\"data_age_day\"] >1 )& (data_c[\"data_age_day\"] <=7) ,\"data_age_cat\"] = '1week'\n",
"data_c.loc[(data_c[\"data_age_day\"] >7)& (data_c[\"data_age_day\"] <= 30) ,\"data_age_cat\"] = '1 month'\n",
"data_c.loc[(data_c[\"data_age_day\"]>30)& (data_c[\"data_age_day\"] <= 90) ,\"data_age_cat\"] = '3 months'\n",
"data_c.loc[(data_c[\"data_age_day\"]>90)& (data_c[\"data_age_day\"] <= 365) ,\"data_age_cat\"] = '1 year'\n",
"data_c.loc[(data_c[\"data_age_day\"]>365)& (data_c[\"data_age_day\"] <= 730) ,\"data_age_cat\"] = '2 years'\n",
"data_c.loc[data_c[\"data_age_day\"]>730 ,\"data_age_cat\"] = '> 2 years'\n",
"\n",
"data_c.loc[data_c[\"window\"] == '-1' ,\"data_age_cat\"] = '<=5 mn'\n",
"data_c.loc[data_c[\"window\"] == '0' ,\"data_age_cat\"] = 'Beginning'\n",
"data_c.loc[(data_c['window'].str.contains('^-', regex=True, na=False)) & (data_c['window'] != '-1'), 'data_age_cat'] = 'Beginning'\n",
"\n",
"\n",
"\n",
"data_c.loc[(data_c[\"duration_sec\"] <= 43200) & (data_c[\"duration_sec\"] >0 ) ,\"duration_cat\"] = '0-12 hr'\n",
"# data_c.loc[(data_c[\"duration_sec\"] > 300) &(data_c[\"duration_sec\"] <= 900) ,\"duration_cat\"] = '5mn-15mn'\n",
"\n",
"# data_c.loc[(data_c[\"duration_sec\"] > 900) &(data_c[\"duration_sec\"] <= 3600) ,\"duration_cat\"] = '15mn-1hr'\n",
"# data_c.loc[(data_c[\"duration_sec\"] > 3600) &(data_c[\"duration_sec\"] <= 10800.0) ,\"duration_cat\"] = '1hr-3hr'\n",
"# data_c.loc[(data_c[\"duration_sec\"] > 10800) &(data_c[\"duration_sec\"] <= 21600) ,\"duration_cat\"] = '3hr-6hr'\n",
"# data_c.loc[(data_c[\"duration_sec\"] > 21600) &(data_c[\"duration_sec\"] <= 43200) ,\"duration_cat\"] = '6hr-12hr'\n",
"data_c.loc[(data_c[\"duration_sec\"] > 43200) &(data_c[\"duration_sec\"] <= 86400) ,\"duration_cat\"] = '12hr-1day'\n",
"# data_c.loc[(data_c[\"duration_sec\"] > 86400) &(data_c[\"duration_sec\"] <= 172800) ,\"duration_cat\"] = '1-2days'\n",
"\n",
"\n",
"\n",
"data_c.loc[(data_c[\"duration_day\"] > 1)& (data_c[\"duration_day\"] <=7) ,\"duration_cat\"] = '1week'\n",
"data_c.loc[(data_c[\"duration_day\"] >7)& (data_c[\"duration_day\"] <= 30) ,\"duration_cat\"] = '1w-1month'\n",
"data_c.loc[(data_c[\"duration_day\"]>30)& (data_c[\"duration_day\"] <= 90) ,\"duration_cat\"] = '1-3months'\n",
"data_c.loc[(data_c[\"duration_day\"]>90)& (data_c[\"duration_day\"] <= 365) ,\"duration_cat\"] = '3m-1year'\n",
"data_c.loc[(data_c[\"duration_day\"]>365)& (data_c[\"duration_day\"] <= 730) ,\"duration_cat\"] = '1-2 years'\n",
"data_c.loc[data_c[\"duration_day\"]>730 ,\"duration_cat\"] = '> 2 years'\n",
"\n",
"data_c.loc[(data_c['window'].str.contains('^-', regex=True, na=False)) & (data_c['window'] != '-1'), 'duration_cat'] = 'All'\n",
"data_c.loc[(data_c['window'] == '0'), 'duration_cat'] = 'All'\n",
"data_c.loc[(data_c['window'] == '-1'), 'duration_cat'] = 'Last Value'\n",
"\n",
"data_c = data_c.loc[data_c['duration_hour'] > 0]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[data_c['duration_day'] <=1 ,\"DurationType\"] = \"Short Duration\" \n",
"data_c.loc[data_c['duration_day'] > 1,\"DurationType\"] = \"Long Duration\"\n",
"\n",
"data_c.loc[data_c['data_age_day'] <=1 ,\"RecencyType\"] = \"Recent\" \n",
"data_c.loc[data_c['data_age_day'] > 1,\"RecencyType\"] = \"Old\""
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"84.81237902974047\n",
"73.6760910736956\n"
]
}
],
"source": [
"#% of queries aiming to recent data\n",
"print(len(data_c.loc[data_c.duration_hour <= 48])/len(data_c)*100)\n",
"#% of queries with short time spans\n",
"print(len(data_c.loc[data_c.data_age_hour <= 24])/len(data_c)*100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate Server, Table and Column number"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"#data_c.loc[~data_c['srctree'].isnull(),'group'] = data_c.loc[~data_c['srctree'].isnull(),'srctree'].str.split(',',expand=True)[0]\n",
"#data_c.loc[~data_c['srctree'].isnull(),'server'] = data_c.loc[~data_c['srctree'].isnull(),'group'].str.split('__',expand=True)[0]\n",
"#data_c.loc[~data_c['srctree'].isnull(),'database'] = data_c.loc[~data_c['srctree'].isnull(),'group'].str.split('__',expand=True)[1]\n",
"\n",
"data_c.loc[(data_c['db_group']=='-3')&(~data_c['srctree'].isnull()),'table_number'] = data_c.loc[(data_c['db_group']=='-3')&(~data_c['srctree'].isnull())].apply(lambda data: get_len(data['srctree']), axis = 1)\n",
"\n",
"data_c.loc[(data_c['db_group']=='-3')&(~data_c['srctree'].isnull()),'column_number'] = data_c.loc[(data_c['db_group']=='-3')&(~data_c['srctree'].isnull())].apply(lambda data: get_col_nb(data['srctree']), axis = 1)\n",
"\n",
"data_c.loc[data_c['db_group']!='-3','column_number'] = data_c.loc[data_c['db_group']!='-3'].apply(lambda data: get_col_nb_novirtual(data['db_mask']), axis = 1)\n",
"data_c.loc[data_c['db_group']!='-3','table_number'] = 1\n",
"data_c = data_c[~data_c.table_number.isnull()]\n",
"data_c = data_c[data_c.column_number != 'All']\n",
"data_c['column_number'] = data_c['column_number'].astype(int) \n",
"\n",
"data_c.loc[~data_c['srctree'].isnull(),'server_number'] = data_c.loc[~data_c['srctree'].isnull()].apply(lambda data: get_server_number(data['srctree']), axis = 1)\n",
"data_c.loc[data_c['srctree'].isnull(),'server_number'] = 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Binning the column number"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[data_c.column_number == 1, 'col_bin'] = '1'\n",
"data_c.loc[(data_c.column_number >=2) & (data_c.column_number <=5), 'col_bin'] = '2-5'\n",
"data_c.loc[(data_c.column_number >5) & (data_c.column_number <=10), 'col_bin'] = '6-10'\n",
"data_c.loc[(data_c.column_number >10) & (data_c.column_number <=50), 'col_bin'] = '11-50'\n",
"data_c.loc[(data_c.column_number >50) & (data_c.column_number <=100), 'col_bin'] = '51-100'\n",
"data_c.loc[data_c.column_number > 100, 'col_bin'] = '100+'\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# latency sec\n",
"data_c['Latency_sec'] = data_c['Latency']/1000000\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate Query Status column"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Programs\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning:\n",
"\n",
"\n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"\n",
"D:\\Programs\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1598: SettingWithCopyWarning:\n",
"\n",
"\n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"\n",
"D:\\Programs\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1719: SettingWithCopyWarning:\n",
"\n",
"\n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"\n",
"D:\\Programs\\anaconda3\\lib\\site-packages\\pandas\\core\\strings\\accessor.py:101: UserWarning:\n",
"\n",
"This pattern has match groups. To actually get the groups, use str.extract.\n",
"\n",
"D:\\Programs\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:16: SettingWithCopyWarning:\n",
"\n",
"\n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"\n"
]
}
],
"source": [
"# content exists only in update requests\n",
"columns = ['Timestamp', 'Setup', 'db_server', 'db_mask', 'Source', 'Request', 'PID', 'ClientID', 'db_name', 'db_group', 'srctree', 'window', 'resample', 'Latency', 'width', 'height', 'SessionID', 'Content']\n",
"data_latency = katrin_update_data[columns]\n",
"data_latency['Timestamp'] = pd.to_datetime(data_latency['Timestamp'])\n",
"\n",
"data_latency.loc[data_latency['Content'].str.contains('Timeout locking file'),'Status'] = 'LockingFileError'\n",
"data_latency.loc[data_latency['Content'].str.contains('DRAW class have not access to the temporary directory'),'Status'] = 'AccessDirectoryError'\n",
"data_latency.loc[data_latency['Content'].str.contains('INTERVAL window have undefined size'),'Status'] = 'WindowSizeError'\n",
"data_latency.loc[data_latency['Content'].str.contains('(timeout expired)'),'Status'] = 'TimeoutError'\n",
"data_latency.loc[data_latency['Content'].str.contains('Connection to the Caching MySQL Server is failed'),'Status'] = 'MYSQLCachingError'\n",
"data_latency.loc[data_latency['Content'].str.contains('find any column'),'Status'] = 'ColumnNotFoundError'\n",
"data_latency.loc[data_latency['Content'].str.contains('The graph dimmensions'),'Status'] = 'GraphDimensionError'\n",
"data_latency.loc[data_latency['Content'].str.contains('SQLDriverConnect'),'Status'] = 'SQLError'\n",
"data_latency.loc[data_latency['Content'].str.contains('New request is received'),'Status'] = 'Received'\n",
"data_latency.loc[data_latency['Content'].str.contains('Request processing is finished'),'Status'] = 'Finished'\n",
"data_latency['Latency_sec'] = data_latency['Latency']/1000000\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Expand server names"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"data_c.loc[~data_c['srctree'].isnull(),'servers'] = data_c.loc[~data_c['srctree'].isnull()].apply(lambda data: get_servers(data['srctree']), axis = 1)\n",
"data_c.loc[data_c.db_server != 'virtual', 'servers'] = data_c.loc[data_c.db_server != 'virtual', ['db_server']]\n",
"expanded_servers = data_c['servers'].explode('servers').reset_index(drop=True)\n",
"expanded_servers = expanded_servers.to_frame()\n",
"expanded_servers = expanded_servers.loc[~expanded_servers.servers.isnull()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot requests per type"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"alignmentgroup": "True",
"hoverlabel": {
"namelength": 0
},
"hovertemplate": "Request_Type=%{x}
Count=%{y}",
"legendgroup": "",
"marker": {
"color": "#75BDA7"
},
"name": "",
"offsetgroup": "",
"orientation": "v",
"showlegend": false,
"textposition": "auto",
"type": "bar",
"x": [
"data analysis",
"data export"
],
"xaxis": "x",
"y": [
2769450,
9795
],
"yaxis": "y"
}
],
"layout": {
"barmode": "relative",
"height": 400,
"legend": {
"tracegroupgap": 0
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Requests per query type"
},
"width": 600,
"xaxis": {
"anchor": "y",
"domain": [
0,
1
],
"title": {
"text": "Query Type"
}
},
"yaxis": {
"anchor": "x",
"domain": [
0,
1
],
"title": {
"text": "Count"
}
}
}
},
"text/html": [
"