adarsh_notes.txt
Some setup notes:
1. Create virtual env or Conda (preferred, see notes below)
mkdir virtualenv
cd virtualenv/
sudo apt install virtualenv
virtualenv -p python py-keras
source py-keras/bin/activate
---install conda
https://www.digitalocean.com/community/tutorials/how-to-install-anaconda-on-ubuntu-18-04-quickstart
download conda sh script from https://www.anaconda.com/distribution/
go to download folder
bash <anaconda sh file name>
--for conda setup
source ~/anaconda3/etc/profile.d/conda.sh
conda config --set auto_activate_base false
conda create -n test6 distributed=2021.6.0 click=7.0
conda activate test6
pip3 install -q -U pip setuptools wheel
2. Install Keras
pip3 install keras==2.0.8
3. Install tensorflow
pip3 install tensorflow_gpu==1.3
--for conda setup
pip3 install tensorflow_gpu==1.15.5
4. Install opencv
pip3 install opencv-python==4.2.0.32
5. install pydicom
pip3 install pydicom
6. install sklearn
pip3 install sklearn
7. install h5py
pip3 install h5py
8. set python path
export PYTHONPATH=$PYTHONPATH:/home/adarshsehgal/workspace/GA-mammograms
9. install pyspark for parallel ga computing
pip3 install pyspark
10. install jmetal
pip3 install "jmetalpy[distributed]"
11. install java
sudo apt install openjdk-8-jdk
12. install python distributed
sudo apt install python3-distributed
13. install ipython
sudo apt install ipython
14. install xgboost
pip3 install xgboost
15. install paramiko
pip3 install paramiko
16. install openssh-server - enables ssh access to this machine
sudo apt install openssh-server
sudo ufw allow ssh
17. install asyncssh
pip3 install asyncssh
18. install nvidia drivers
https://towardsdatascience.com/deep-learning-gpu-installation-on-ubuntu-18-4-9b12230a1d31
sudo apt-get install python3-apt
cd /usr/lib/python3/dist-packages
sudo cp apt_pkg.cpython-36m-x86_64-linux-gnu.so apt_pkg.so
sudo apt-get install ubuntu-drivers-common
sudo ubuntu-drivers autoinstall
sudo apt install intel-gpu-tools
19. install cuda
https://towardsdatascience.com/deep-learning-gpu-installation-on-ubuntu-18-4-9b12230a1d31
IMP: install cuda 10.0/10.1
https://developer.nvidia.com/cuda-10.0-download-archive?target_os=Linux&target_arch=x86_64&target_distro=Ubuntu&target_version=1804&target_type=runfilelocal
sudo sh cuda_10.0.130_410.48_linux.run
OR
sudo sh cuda_10.1.105_418.39_linux.run
--extras - Don't follow this
wget https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run
sudo sh cuda_11.6.2_510.47.03_linux.run --toolkit --silent --override
20. install cuDNN
IMP: install version 7.4 corresponding to tensorflow 1.15.5
copy versions 7.4 and 7.6 cuDNN files to /usr/local/cuda
https://towardsdatascience.com/deep-learning-gpu-installation-on-ubuntu-18-4-9b12230a1d31
sudo apt-get install libcupti-dev
$ sudo cp cuda/include/cudnn.h /usr/local/cuda/include
$ sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64
$ sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
===using cuda 10.0 and 10.1 together
sudo cp /usr/local/cuda-10.0/lib64/libcudart.so.10.0 /usr/local/cuda/lib64
sudo cp /usr/local/cuda-10.0/lib64/libcublas.so.10.0 /usr/local/cuda/lib64
sudo cp /usr/local/cuda-10.0/lib64/libcufft.so.10.0 /usr/local/cuda/lib64
sudo cp /usr/local/cuda-10.0/lib64/libcurand.so.10.0 /usr/local/cuda/lib64
sudo cp /usr/local/cuda-10.0/lib64/libcusolver.so.10.0 /usr/local/cuda/lib64
sudo cp /usr/local/cuda-10.0/lib64/libcusparse.so.10.0 /usr/local/cuda/lib64
21. add the CUDA library path in ~/.bashrc
if [ -z "$LD_LIBRARY_PATH" ]; then
  LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64
else
  LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.0/lib64
fi
export LD_LIBRARY_PATH
source ~/.bashrc
22. install jupyter-server-proxy
pip3 install jupyter-server-proxy
23. install yarn - cluster manager
sudo apt install yarn
24. Allow root ssh login
The default setting in Debian (and hence Ubuntu) for OpenSSH Server is to deny password-based login for root and allow only key-based login. Change this line in /etc/ssh/sshd_config:
PermitRootLogin without-password
to
PermitRootLogin yes
PasswordAuthentication yes
And restart the SSH server:
sudo service ssh restart
25. Update python3 version to 3.7 on ubuntu 18, outside conda environment
https://dev.to/serhatteker/how-to-upgrade-to-python-3-7-on-ubuntu-18-04-18-10-5hab
26. setting up ssh cluster
1. to connect to localhost
try ssh localhost, accept the key
try ssh root@localhost, accept the key
do the above for all workers and accept the keys
-set root password
sudo passwd
ssh-keygen
ssh-copy-id -i root@ip_address
ssh root@ip_address
27. Setup spark master
help link: https://stackoverflow.com/questions/33150147/master-must-start-with-yarn-spark
https://medium.com/ymedialabs-innovation/apache-spark-on-a-multi-node-cluster-b75967c8cb2b
https://spark.apache.org/downloads.html
Edit hosts file.
$ sudo gedit /etc/hosts
Now add entries of master and slaves in hosts file.
<MASTER-IP> master
<SLAVE01-IP> slave01
<SLAVE02-IP> slave02
28. Download and extract Spark
$ cd ~/Downloads
$ wget -c https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
$ cd /tmp
$ tar zxf ~/Downloads/spark-3.2.1-bin-hadoop3.2.tgz
sudo mv spark-3.2.1-bin-hadoop3.2 /usr/local/spark
$ cd /usr/local/spark/conf
29. List the workers in the slaves file
sudo gedit slaves
-And add the following entries.
master
slave01
slave02
slave02
-- listing a hostname twice starts two workers on that machine (slave02 here)
30. Start Spark Cluster
$ cd /usr/local/spark
$ ./sbin/start-all.sh
To stop the spark cluster, run the following command on master.
$ cd /usr/local/spark
$ ./sbin/stop-all.sh
31. submit to spark (or just run python3 ga_jmetal.py)
send the conda environment with spark-submit
conda pack -o test7_env.tar.gz
./bin/spark-submit --master spark://adarshsehgal-ubuntu-18-2:7077 \
--archives /home/adarshsehgal/workspace/GA-mammograms/ddsm_train/test7_env.tar.gz#environment \
/home/adarshsehgal/workspace/GA-mammograms/ddsm_train/ga_jmetal.py
32. install dependencies on local ubuntu
sudo apt-get install python3-pip
pip3 install Cython
sudo apt install python-dev
sudo apt install gfortran
sudo apt-get install python3.7-dev
pip3 install numpy
pip3 install pybind11
pip3 install -q -U pip setuptools wheel
pip3 install scipy
33. specify spark host name in conf file
spark-env.sh in conf folder
SPARK_MASTER_HOST=192.168.0.152
34. edit /etc/hosts
remove 127.0.1.1 entries from both master and worker
35. install gpustat
pip3 install gpustat
36. monitor gpu utilization
sudo intel_gpu_top
37. install cuda toolkit
sudo apt install nvidia-cuda-toolkit
38. enable GPU for spark
https://nvidia.github.io/spark-rapids/Getting-Started/
https://github.com/apache/spark/blob/master/docs/spark-standalone.md#resource-allocation-and-configuration-overview
https://nvidia.github.io/spark-rapids/docs/get-started/getting-started-on-prem.html
jars can be downloaded from https://nvidia.github.io/spark-rapids/docs/download.html
jar name = cudf-0.19.2-cuda10-1.jar, rapids-4-spark_2.12-0.5.0.jar
add to ~/.bashrc
export SPARK_RAPIDS_DIR=/home/adarshsehgal/workspace/GA-mammograms/ddsm_train
export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.14-cuda10-1.jar
export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-0.1.0.jar
39. update spark-env.sh
add discovery script path
SPARK_WORKER_OPTS="-Dspark.worker.resource.gpu.amount=6 -Dspark.worker.resource.gpu.discoveryScript=/home/adarshsehgal/workspace/GA-mammograms/ddsm_train/getGpusResources.sh"
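For reference, a Spark resource discovery script just has to print one JSON object naming the resource and its addresses; the repo points at getGpusResources.sh (the standard RAPIDS example). Below is a minimal Python sketch of the same idea, assuming nvidia-smi is on the PATH; it is an illustration, not the repo's script.
#!/usr/bin/env python3
# Hypothetical stand-in for getGpusResources.sh: a Spark resource discovery
# script must print one JSON object such as {"name": "gpu", "addresses": ["0", "1"]}.
import json
import subprocess

def list_gpu_indices():
    # Ask nvidia-smi for the index of each visible GPU, one per line.
    out = subprocess.run(
        ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"],
        capture_output=True, text=True, check=True,
    )
    return [line.strip() for line in out.stdout.splitlines() if line.strip()]

if __name__ == "__main__":
    print(json.dumps({"name": "gpu", "addresses": list_gpu_indices()}))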
=========================================================================================
Some IMPORTANT points:
1. MAIN SCRIPT FILE: train_image_clf_inbreast.sh
./train_image_clf_inbreast.sh
2. white_list_formats = {'png', 'jpg', 'jpeg', 'bmp'}
only these format images can be used for train/test/val
3. train/test/val folders should each contain folders named 'neg' (negative) and 'pos' (positive)
these are the class names, which are passed via the '--class-list' option
4. Inbreast dataset link
https://drive.google.com/file/d/19n-p9p9C0eCQA1ybm6wkMo-bbeccT_62/view
5. Use inbreast_preprocess_script.py to split the DICOM images into positive/negative sets based on BI-RADS assessment categories (see the sketch after this list)
Negative - 1/2
Positive - 4/5/6
6. Use inbreast_preprocess_script.py to convert DICOM to PNG images
7. --batch-size should be changed based on number of images used for testing, default was set to 4
--all-layer-epochs controls how many epochs you want to run the training for
8. Use inbreastbuilder.m/inbreastbuilder.py with read_mixed_csv.m/read_mixed_csv.py files to process inbreast database
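A minimal sketch of the preprocessing described in points 3, 5 and 6: map a BI-RADS assessment to the 'neg'/'pos' class folders and convert the DICOM to an 8-bit PNG in the layout the trainer expects. Function names, the normalization, and the output layout handling are hypothetical; the actual logic lives in inbreast_preprocess_script.py.
# Hypothetical sketch, not the repo's inbreast_preprocess_script.py.
# Expected layout (point 3): train/pos, train/neg, val/pos, val/neg, test/pos, test/neg
import os
import cv2
import numpy as np
import pydicom

def birads_to_class(birads):
    # BI-RADS 1/2 -> negative, 4/5/6 -> positive, anything else is skipped.
    if birads in (1, 2):
        return "neg"
    if birads in (4, 5, 6):
        return "pos"
    return None

def dicom_to_png(dcm_path, out_path):
    # Read one DICOM, rescale its pixels to 8-bit, and save it as PNG.
    img = pydicom.dcmread(dcm_path).pixel_array.astype(np.float32)
    img = (img - img.min()) / max(img.max() - img.min(), 1e-6) * 255.0
    cv2.imwrite(out_path, img.astype(np.uint8))

def convert(dcm_path, birads, out_root="train"):
    cls = birads_to_class(birads)
    if cls is None:
        return
    os.makedirs(os.path.join(out_root, cls), exist_ok=True)
    name = os.path.splitext(os.path.basename(dcm_path))[0] + ".png"
    dicom_to_png(dcm_path, os.path.join(out_root, cls, name))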
========================================================================================
How to run the code:
1. Activate virtual env
cd virtualenv/
source py-keras/bin/activate
2. set python path
export PYTHONPATH=$PYTHONPATH:/home/adarsh/workspace/GA-mammograms
3. Go to code directory
cd && cd workspace/GA-mammograms/ddsm_train/
4. Execute the script
./train_image_clf_inbreast.sh
======================================================================================
STEPS TO PROGRAM GA-E2E ALGORITHM:
1. Preprocess Inbreast dataset
- write MATLAB code to split positive and negative images
- write python code to convert DICOM to PNG format
2. install required packages for E2E
3. split preprocessed png images to train/val/test sub-sets
- ALL are CC view images
- ALL images belong to different patients
train sub-set: 4 neg, 4 pos
val sub-set: 2 neg, 2 pos
test sub-set: 4 neg, 4 pos
4. Program ga.py to build the command string that is run for each fitness evaluation (see the sketch after this list)
- the fitness function returns AUC, because AUC is what we want to maximize
- use the AUC from the test set
- Approx 160 s for each epoch
- Total of 4 epochs per training - to save time
- Approx 11 minutes per fitness function evaluation
5. Program plot_rewards.py to plot AUC vs. number of fitness function evaluations
- This plots AUC on the y axis and the number of fitness function evaluations on the x axis.
- It shows the overall increase in AUC across fitness evaluations
- i.e. the search space moving in a promising direction
- also plot the time taken per fitness function evaluation
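A minimal sketch of steps 4 and 5, under assumptions: the training script is invoked with the GA parameters as command-line flags and prints a line containing the test AUC that is parsed back out. The exact command format and the AUC output line are hypothetical; the real versions live in ga.py and plot_rewards.py.
# Hypothetical sketch of ga.py's fitness evaluation and plot_rewards.py's plot.
import re
import subprocess
import matplotlib.pyplot as plt

def evaluate_fitness(params):
    # Build the training command from the GA parameters, run it, return the test AUC.
    cmd = (
        "./train_image_clf_inbreast.sh "
        f"--pos-cls-weight {params['pos_cls_weight']:.3f} "
        f"--neg-cls-weight {params['neg_cls_weight']:.3f} "
        f"--weight-decay {params['weight_decay']:.3f} "
        f"--weight-decay2 {params['weight_decay2']:.3f} "
        f"--init-learningrate {params['init_lr']:.3f} "
        f"--all-layer-multiplier {params['all_layer_multiplier']:.3f}"
    )
    out = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    # Assume the trainer prints something like "test AUC: 0.5625" on stdout.
    match = re.search(r"test AUC:\s*([0-9.]+)", out.stdout)
    return float(match.group(1)) if match else 0.0

def plot_rewards(auc_history, out_file="auc_vs_evaluations.png"):
    # AUC on the y axis, fitness-evaluation count on the x axis.
    plt.plot(range(1, len(auc_history) + 1), auc_history, marker="o")
    plt.xlabel("Fitness function evaluations")
    plt.ylabel("Test AUC")
    plt.savefig(out_file)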
=====================================================================================
EXPERIMENTS PERFORMED
1. pos/neg having multiple views of same patient
-train/val AUC=1.0
-test AUC=0.5
-train sub-set: 4 neg, 4 pos
-val sub-set: 2 neg, 2 pos
-test sub-set: 4 neg, 4 pos
-Training uses 4 epochs
-num_cpu=4
-160 s per epoch
2. pos/neg having only CC views of different patients
-train sub-set: 4 neg, 4 pos
-val sub-set: 2 neg, 2 pos
-test sub-set: 4 neg, 4 pos
-Training uses 4 epochs
-For training, epochs 1/2 AUC 0.75, epochs 3/4 AUC 1.0
-test AUC=0.4375
-num_cpu=4
-160 s per epoch
3. pos/neg having only CC views of different patients
-train sub-set: 4 neg, 4 pos
-val sub-set: 2 neg, 2 pos
-test sub-set: 4 neg, 4 pos
-Training uses 4 epochs
-init-learningrate 0.999
-RESULT: changing init-learningrate changes the AUC
4. Running my old GA with 6 parameters
- original AUC was 0.4375
- GA found AUC 0.5
- train sub-set: 4 neg, 4 pos
- val sub-set: 2 neg, 2 pos
- test sub-set: 4 neg, 4 pos
- Training uses 4 epochs
- NUM_GPU_CORE = 6
- plot generated for AUC vs times fitness function evaluated
- all parameters have ranges 0-0.999
- Limited images due to system limitations
- Code ran on ubuntu local machine
- Google colab did not support tensorflow v1.3, which is needed to run the code
=====================================================================================
PARAMETERS FOR GA OPTIMIZATION:
-pos-cls-weight, range 0-1, 3 decimal places, default: 1.0
-neg-cls-weight, range 0-1, 3 decimal places, default: 1.0
-weight-decay, range 0-1, 3 decimal places, default: 0.001
-weight-decay2, range 0-1, 3 decimal places, default: 0.01
Best auc so far : 0.5625
BEST CHROMOSOME IS
[False False True True True True True False False False True True
True False True False True True True False False True False True
True False False True False False True False False False True True
True True False True False True True True False False False False
True False True True True True False True False False False False
False True True False True False]
Its decoded values are
pos_cls_weight = 0.248
neg_cls_weight = 1.721
weight_decay = 0.804
weight_decay2 = 0.983
init_lr = 0.094
all_layer_multiplier = 0.026
--- 355461.121692 seconds ---
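For reference, a generic sketch of how a boolean chromosome like the one above can be decoded into real-valued parameters: slice the bit string into equal chunks, read each chunk as a binary integer, and scale it into that parameter's range. The 11-bits-per-parameter split and the example ranges are assumptions for illustration; ga.py defines the actual encoding.
# Generic decode sketch; chunk size and ranges are assumptions, not the repo's exact encoding.
import random

def decode_chromosome(bits, ranges, bits_per_param=11):
    # bits: list of booleans; ranges: list of (low, high) pairs, one per parameter.
    params = []
    for i, (low, high) in enumerate(ranges):
        chunk = bits[i * bits_per_param:(i + 1) * bits_per_param]
        value = int("".join("1" if b else "0" for b in chunk), 2)
        scale = value / (2 ** bits_per_param - 1)            # map bits to [0, 1]
        params.append(round(low + scale * (high - low), 3))  # map to [low, high]
    return params

# Example: 6 parameters, each in [0, 0.999], decoded from a 66-bit chromosome.
bits = [random.random() < 0.5 for _ in range(66)]
print(decode_chromosome(bits, [(0.0, 0.999)] * 6))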
=================================================================================
-MIT-licensed jMetal implementation (jMetalPy)
https://github.com/jMetal/jMetalPy
-how to run jmetal in parallel
https://jmetal.github.io/jMetalPy/tutorials/evaluator.html
from jmetal.util.evaluator import MapEvaluator
from jmetal.util.evaluator import MultiprocessEvaluator
from jmetal.util.evaluator import SparkEvaluator

algorithm = NSGAII(
    problem=problem,
    population_size=100,
    offspring_population_size=100,
    ...
    population_evaluator=SparkEvaluator(processes=8),
)
pip3 install pyspark
-how to install spark for parallel run
https://www.mytechmint.com/how-to-install-pyspark-with-java-8-on-ubuntu-18-04/
-Steps to setup workspace to run jMetal version of GA
1. create test/val folders in directory ddsm_train
2. update path in mammogram.py
3. create pos/neg folders in test/val folders
4. convert dicom images to png and save in pos/neg folders
-----Steps to use python dask scheduler
1. run dask-scheduler on machine 1
2. run dask-worker on the remaining machines - dask-worker <address from dask-scheduler>
3. run the function with .compute() (see the sketch below)
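A minimal sketch of step 3, assuming a dask-scheduler is already listening on port 8786 with workers attached; the scheduler address and the toy fitness function are placeholders.
# Minimal dask.distributed sketch; the address and fitness() are placeholders.
import dask
from dask.distributed import Client

client = Client("tcp://192.168.0.152:8786")  # connect to the running dask-scheduler

@dask.delayed
def fitness(x):
    # stand-in for one fitness evaluation
    return x * x

# Build lazy tasks, then run them across the connected workers with .compute()
results = dask.compute(*[fitness(i) for i in range(24)])
print(results)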
$ conda create -n test distributed=2021.6.0 click=8.0.1 # this one worked
-tensorflow=1.3 not installing
conda create -n test6 distributed=2021.6.0 click=7.0 # this one worked
-pip3 install tensorflow==1.15.5
source ~/anaconda3/etc/profile.d/conda.sh
$ conda activate test
$ dask-scheduler --version
dask-scheduler, version 2021.6.0
-Monitor Ram usage
watch -n 5 free -m
-Monitor CPU usage
top
--Action items (4/24/2022):
1. save jmetal source code - Done
2. Plot generation wise results
3. increase number of epochs - set to 10
4. increase population size - set to 100
5. offspring population - set to 50
6. Set the ranges/precision of the parameter values - currently the system is generating more than 10 digits after the decimal
--New GA (4/24/2022)
1. parallel - 24 processes at one time
2. distributed
3. using new operators
4. setup local cluster at home with two computers connected via network switch
5. using the python3-distributed package for dask
6. using dask-scheduler to connect multiple dask-workers on network
7. using 24 cores of cpu (12 cores on each computer)
8. dashboard monitor to check diagnostic information like cpu/memory usage
9. Per-epoch speed increased, even with more images
--challenges (4/24/2022)
1. setting up a cluster using a network switch (computers on Dr. Louis's cluster were too slow and had version problems)
2. dask distributed version on different computers
3. tensorflow version selection
4. selecting python version that works on both distributed and tensorflow
5. finding a suitable number of parallel processes (24 processes at a time did not work, then reduced to 12)
-- setting up ssh cluster
1. to connect to localhost
try ssh localhost, accept the key
ssh-keygen
ssh-copy-id -i root@ip_address
ssh root@ip_address
2. follow same for other ssh machines
--kill processes on port 8786
sudo lsof -t -i tcp:8786 | xargs kill -9
----------------------------------
Setup spark master
help link: https://stackoverflow.com/questions/33150147/master-must-start-with-yarn-spark
$ cd ~/Downloads
$ wget -c https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
$ cd /tmp
$ tar zxf ~/Downloads/spark-3.2.1-bin-hadoop3.2.tgz
$ cd spark-3.2.1-bin-hadoop3.2/
-start master
$ sbin/start-master.sh
-start slave
$ sbin/start-slave.sh spark://ego-server:7077
-setting up spark cluster with hadoop
https://dzone.com/articles/setting-up-multi-node-hadoop-cluster-just-got-easy-2
Important steps to set up a spark cluster
https://medium.com/ymedialabs-innovation/apache-spark-on-a-multi-node-cluster-b75967c8cb2b
--running python file on already built spark cluster
./bin/spark-submit --master spark://192.168.0.152:7077 /home/adarshsehgal/workspace/GA-mammograms/ddsm_train/ga_jmetal.py
./bin/spark-submit --master spark://adarshsehgal-Alienware-Aurora-R7:7077 /home/adarshsehgal/workspace/GA-mammograms/ddsm_train/ga_jmetal.py
--setup hadoop yarn
https://phoenixnap.com/kb/install-hadoop-ubuntu
--install python3.7 on ubuntu 18
https://linuxize.com/post/how-to-install-python-3-7-on-ubuntu-18-04/
==================================================================
KNOWN ERRORS
1.
--File "/usr/lib/python3/dist-packages/toolz/functoolz.py", line 501
f.__name__ for f in reversed((self.first,) + self.funcs),
^
SyntaxError: Generator expression must be parenthesized
SOLUTION: use the functoolz.py file in the ddsm_train folder (see the illustration of the fix below)
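For reference, a minimal illustration of this error class (not the actual toolz source): on Python 3.7+ a bare generator expression followed by a trailing comma or another argument must be parenthesized.
# Minimal illustration of the SyntaxError above; not the actual toolz code.
funcs = [len, str, repr]

# Rejected on Python 3.7+ (generator expression must be parenthesized):
#   name = '_of_'.join(f.__name__ for f in reversed(funcs),)

# Accepted: drop the trailing comma, or parenthesize the generator expression.
name = '_of_'.join(f.__name__ for f in reversed(funcs))
print(name)  # -> repr_of_str_of_len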
2. Software & Updates not opening
sudo apt autoremove gnome-software && sudo apt install gnome-software
sudo apt update && sudo apt dist-upgrade -f