Skip to content
This repository has been archived by the owner on Mar 28, 2023. It is now read-only.

Analytics

skoulouzis edited this page Oct 24, 2015 · 15 revisions

Get the IP address with the most requests

-- Per-client-IP GET totals, busiest IP first.
-- elapsedTime is scaled by 1.66667e-5 (unit conversion; presumably
-- milliseconds -> minutes, confirm against the logger that fills requests_table).
SELECT
    remoteAddr,
    COUNT(*) AS count,
    (SUM(elapsedTime) * 1.66667e-5) AS elapsed_sum
FROM requests_table
WHERE methodName = 'GET'
GROUP BY remoteAddr
ORDER BY count DESC;

Show elapsed time and request count from specific IP

Create and fill calendar table to join with requests

-- Build a calendar dimension table so request stats can later be joined
-- against every day in a range, including days with zero requests.
DROP PROCEDURE IF EXISTS FillCalendar;

DROP TABLE IF EXISTS calendar;

-- Fix: the original also declared INDEX(calendar_date), which is
-- redundant — the PRIMARY KEY already indexes that column.
CREATE TABLE IF NOT EXISTS calendar (
    calendar_date DATE NOT NULL PRIMARY KEY
);

DELIMITER $$
-- FillCalendar(start_date, end_date): inserts one row per day from
-- start_date through end_date (inclusive). INSERT IGNORE makes repeated
-- calls over overlapping ranges safe.
CREATE PROCEDURE FillCalendar(start_date DATE, end_date DATE)
BEGIN
    DECLARE crt_date DATE;
    SET crt_date = start_date;
    WHILE crt_date <= end_date DO
        INSERT IGNORE INTO calendar VALUES (crt_date);
        SET crt_date = ADDDATE(crt_date, INTERVAL 1 DAY);
    END WHILE;
END$$
DELIMITER ;

CALL FillCalendar('2013-10-01', '2015-10-31');

Create a temporary table to hold the results. Joining the requests_table and calendar directly takes forever.

DROP TABLE IF EXISTS requests_table_tmp;

-- Staging table for per-day aggregates; querying this instead of
-- requests_table keeps the calendar join fast.
CREATE TEMPORARY TABLE IF NOT EXISTS requests_table_tmp (
    uid         SERIAL PRIMARY KEY,
    req_count   INT,
    elapsed_sum DOUBLE,
    req_date    TIMESTAMP,
    INDEX (req_date)
) ENGINE = InnoDB;

Run the query and insert it to the requests_table_tmp.

TRUNCATE TABLE requests_table_tmp;

-- Daily GET totals for one client IP (test traffic from 'tester' excluded).
-- Fix: the original selected the bare timeStamp while grouping by
-- YEAR/MONTH/DAY, which returns an arbitrary row's timestamp per group and
-- only parses with ONLY_FULL_GROUP_BY disabled. MIN(timeStamp) is a
-- deterministic representative of each day and leaves row counts unchanged.
INSERT INTO requests_table_tmp (req_count, elapsed_sum, req_date)
SELECT
    COUNT(*) AS req_num,
    (SUM(elapsedTime) * 1.66667e-5) AS elapsed_sum,
    MIN(timeStamp) AS req_date
FROM requests_table
WHERE methodName = 'GET'
  AND userName != 'tester'
  AND remoteAddr = '192.168.100.1'
GROUP BY YEAR(timeStamp), MONTH(timeStamp), DAY(timeStamp);

Now do the join.

-- Daily series: one row per calendar day, with request stats or NULLs
-- for days that saw no traffic.
-- Fix: calendar_date is already a DATE, so the original's
-- date(c.calendar_date) in the ON and GROUP BY clauses only defeated the
-- primary-key index; compare and group on the column directly.
SELECT
    c.calendar_date,
    pi.req_count,
    pi.elapsed_sum
FROM calendar c
LEFT JOIN requests_table_tmp pi
    ON c.calendar_date = DATE(pi.req_date)
GROUP BY c.calendar_date;

Parse logs with logstash

If you want to parse logs created by LOBCDER for logstash and store them in elasticsearch you can use the configuration file in https://github.com/skoulouzis/lobcder/blob/dev/lobcder-tests/etc/logstash-lobcder1.2.2.conf. You will need to edit the configuration file and set the paths that correspond to your setup.

# Query Elasticsearch

To export usage data to CSV, use these commands.

## Uploads

for((i=0;i<=2000000;i+=1000)); do curl "localhost:9200/_all/_search?q=rx_size:*&from=$i&size=1000" | jq --raw-output '.hits.hits | .[] | ._source | "\(.logdate)\t\(.rx_source)\t\(.rx_destination)\t\(.rx_speed)\t\(.rx_size)\t\(.rx_sourceLoc.country_code2)\t\(.rx_sourceLoc.country_name)"'; done >> rx_size.csv

## Downloads

for((i=0;i<=2000000;i+=1000)); do curl "http://localhost:9200/_all/_search?q=tx_size:*&from=$i&size=1000" | jq --raw-output '.hits.hits | .[] | ._source | "\(.logdate)\t\(.tx_source)\t\(.tx_destination)\t\(.tx_speed)\t\(.tx_size)\t\(.tx_destinationLoc.country_code2)\t\(.tx_destinationLoc.country_name)"'; done >> tx_size.csv

## Replication

for((i=0;i<=2000000;i+=1000)); do curl "http://localhost:9200/_all/_search?q=repl_source:*&from=$i&size=1000" | jq --raw-output '.hits.hits | .[] | ._source | "\(.logdate)\t\(.repl_source)\t\(.repl_destination)\t\(.repl_speed)\t\(.repl_size)"' && sleep 1; done >> repl_size.csv

## All client requests

for((i=0;i<=2000000;i+=1000)); do curl "http://localhost:9200/_all/_search?q=req_source:*&from=$i&size=1000" | jq --raw-output '.hits.hits | .[] | ._source | "\(.logdate)\t\(.req_sourceLoc.country_code2)\t\(.req_sourceLoc.country_name)\t\(.req_sourceLoc.city_name)\t\(.req_userAgent)\t\(.req_user)\t\(.req_verb)\t\(.req_contLen)\t\(.req_contType)\t\(.req_elapsed)\t\(.req_contType)\t\(.req_queryString)\t\(.req_URL)"'; done >> req_size.csv

For specific dates, for example all of October:

for((i=0;i<=2000000;i+=1000)); do curl "http://localhost:9200/logstash-2015.10.*/_search?q=req_source:*&from=$i&size=1000" | jq --raw-output '.hits.hits | .[] | ._source | "\(.logdate)\t\(.req_sourceLoc.country_code2)\t\(.req_sourceLoc.country_name)\t\(.req_sourceLoc.city_name)\t\(.req_userAgent)\t\(.req_user)\t\(.req_verb)\t\(.req_contLen)\t\(.req_contType)\t\(.req_elapsed)\t\(.req_contType)\t\(.req_queryString)\t\(.req_URL)"'; done >> req_size.csv

To list all indexes (dates in our case) in Elasticsearch: curl 'localhost:9200/_cat/indices?v'

To check the status of elasticsearch: curl -XGET 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=5s&pretty=true'

## Import the exported CSV files to MySQL

Create the tables.

For uploads:

-- Upload (rx) statistics exported from Elasticsearch.
CREATE TABLE rx_stats (
    logdate        DATE,
    rx_source      VARCHAR(5240),
    rx_destination VARCHAR(5240),
    rx_speed       DOUBLE,
    rx_SIZE        DOUBLE
) ENGINE = InnoDB;

For downloads:

-- Download (tx) statistics exported from Elasticsearch.
CREATE TABLE tx_stats (
    logdate        DATE,
    tx_source      VARCHAR(5240),
    tx_destination VARCHAR(5240),
    tx_speed       DOUBLE,
    tx_SIZE        DOUBLE
) ENGINE = InnoDB;

For replications:

-- Replication statistics exported from Elasticsearch.
CREATE TABLE repl_stats (
    logdate          DATE,
    repl_source      VARCHAR(5240),
    repl_destination VARCHAR(5240),
    repl_speed       DOUBLE,
    repl_SIZE        DOUBLE
) ENGINE = InnoDB;

For requests:

-- Per-request statistics exported from Elasticsearch.
-- req_contType appears twice in the export, hence req_contType1.
CREATE TABLE req_stats (
    logdate                  DATE,
    req_source_country_code2 VARCHAR(5240),
    req_source_country_name  VARCHAR(5240),
    req_source_city_name     VARCHAR(5240),
    req_userAgent            VARCHAR(5240),
    req_user                 VARCHAR(5240),
    req_verb                 VARCHAR(5240),
    req_contLen              BIGINT,
    req_contType             VARCHAR(5240),
    req_elapsed              DOUBLE,
    req_contType1            VARCHAR(5240),
    req_queryString          VARCHAR(5240),
    req_URL                  VARCHAR(5240)
) ENGINE = InnoDB;

Import the csv. For uploads:

-- Import the upload CSV.
-- Fix: the original loaded rx_size.csv INTO TABLE repl_stats — a
-- copy/paste error; the column list is rx_*, so the target is rx_stats.
-- @var1 holds the raw date string, parsed into logdate via STR_TO_DATE.
LOAD DATA LOCAL INFILE 'rx_size.csv'
INTO TABLE rx_stats
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
(@var1, rx_source, rx_destination, rx_speed, rx_size)
SET logdate = STR_TO_DATE(@var1, '%M %d,%Y %h:%i:%s %p');

For downloads:

-- Import the download CSV.
-- Fix: the original loaded tx_size.csv INTO TABLE repl_stats — a
-- copy/paste error; the column list is tx_*, so the target is tx_stats.
LOAD DATA LOCAL INFILE 'tx_size.csv'
INTO TABLE tx_stats
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
(@var1, tx_source, tx_destination, tx_speed, tx_size)
SET logdate = STR_TO_DATE(@var1, '%M %d,%Y %h:%i:%s %p');

For replications:

-- Import the replication CSV; @var1 is the raw date string, parsed into
-- logdate via STR_TO_DATE.
LOAD DATA LOCAL INFILE 'repl_size.csv'
INTO TABLE repl_stats
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
(@var1, repl_source, repl_destination, repl_speed, repl_size)
SET logdate = STR_TO_DATE(@var1, '%M %d,%Y %h:%i:%s %p')

For requests:

-- Import the per-request CSV; column order must match the jq export above.
LOAD DATA LOCAL INFILE 'req_size.csv'
INTO TABLE req_stats
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
(@var1, req_source_country_code2, req_source_country_name,
 req_source_city_name, req_userAgent, req_user, req_verb, req_contLen,
 req_contType, req_elapsed, req_contType1, req_queryString, req_URL)
SET logdate = STR_TO_DATE(@var1, '%M %d,%Y %h:%i:%s %p')

Get the elapsed time between two GET requests from a specific IP

It's not pretty but it works.

DROP TABLE IF EXISTS requests_table_tmp1;

CREATE TEMPORARY TABLE IF NOT EXISTS requests_table_tmp1 (
    uid SERIAL PRIMARY KEY,
    req_date TIMESTAMP,
    INDEX (req_date),
    folder VARCHAR(512)
) ENGINE = InnoDB;

-- One row per distinct timeStamp of a GET from the 10.x network.
-- folder keeps the first N '/'-separated pieces of requestURL, where N is
-- the number of slashes in the URL — i.e. everything up to the last
-- component (the parent path).
INSERT INTO requests_table_tmp1 (uid, req_date, folder)
SELECT
    uid,
    timeStamp,
    SUBSTRING_INDEX(requestURL, '/',
        LENGTH(requestURL) - LENGTH(REPLACE(requestURL, '/', ''))) AS folder
FROM requests_table
WHERE methodName = 'GET'
  AND userName != 'tester'
  AND remoteAddr LIKE '10.%'
GROUP BY timeStamp;

DROP TABLE IF EXISTS requests_table_tmp2;

-- Identical copy of requests_table_tmp1 — presumably needed because MySQL
-- cannot open the same TEMPORARY table twice in a single query (verify for
-- your server version).
CREATE TEMPORARY TABLE IF NOT EXISTS requests_table_tmp2 (
    uid SERIAL PRIMARY KEY,
    req_date TIMESTAMP,
    INDEX (req_date),
    folder VARCHAR(512)
) ENGINE = InnoDB;

INSERT INTO requests_table_tmp2 (uid, req_date, folder)
SELECT
    uid,
    timeStamp,
    SUBSTRING_INDEX(requestURL, '/',
        LENGTH(requestURL) - LENGTH(REPLACE(requestURL, '/', ''))) AS folder
FROM requests_table
WHERE methodName = 'GET'
  AND userName != 'tester'
  AND remoteAddr LIKE '10.%'
GROUP BY timeStamp;

-- Pair each request (tmp1 row N+1) with the one before it (tmp2 row N)
-- and report the gap between them.
-- Fix: the original used raw '-' on two TIMESTAMPs, which MySQL evaluates
-- as numeric arithmetic on their YYYYMMDDHHMMSS form — wrong whenever the
-- two requests straddle a minute/hour/day boundary. TIMESTAMPDIFF returns
-- the true elapsed time in seconds.
SELECT
    requests_table_tmp1.folder AS fromFolder,
    requests_table_tmp2.folder AS toFolder,
    TIMESTAMPDIFF(SECOND,
        requests_table_tmp2.req_date,
        requests_table_tmp1.req_date) AS timedifference
FROM requests_table_tmp1
JOIN requests_table_tmp2
    ON requests_table_tmp1.uid = requests_table_tmp2.uid + 1