Skip to content

Commit

Permalink
Refactor and fix ansible-lint issues (Part 1)
Browse files Browse the repository at this point in the history
  • Loading branch information
Lun4m committed Oct 2, 2024
1 parent 67e564a commit cf1b54a
Show file tree
Hide file tree
Showing 47 changed files with 1,324 additions and 1,249 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
target/*
ansible/roles/deploy/files/resources
ansible/roles/deploy/files/lard_ingestion
6 changes: 6 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
notes.txt
ansible.cfg
.yamlfmt
.run.sh

roles/deploy/files/resources
roles/deploy/files/lard_ingestion
40 changes: 8 additions & 32 deletions ansible/bigip.yml
Original file line number Diff line number Diff line change
@@ -1,52 +1,28 @@
- name: Copy schema for bigip
---
- name: Create what is needed for the bigip load balancers
hosts: servers
remote_user: ubuntu
vars:
ostack_cloud: lard
ostack_region: Ostack2-EXT
hosts: localhost # need to separate this since done from localhost
gather_facts: false
pre_tasks:
# copy file, so we have an .sql file to apply locally
- name: Create a directory if it does not exist
- name: Create a directory if it does not exist
ansible.builtin.file:
path: /etc/postgresql/16/db/bigip
state: directory
mode: '0755'
become: true
delegate_to: '{{ hostvars[groups["servers"][0]].ansible_host }}'
remote_user: ubuntu

- name: Copy the schema to the remote 1
ansible.builtin.copy:
src: ./roles/bigip/vars/bigip.sql
dest: /etc/postgresql/16/db/bigip/bigip.sql
mode: '0755'
become: true
delegate_to: '{{ hostvars[groups["servers"][0]].ansible_host }}'
remote_user: ubuntu
- name: Create a directory if it does not exist
ansible.builtin.file:
path: /etc/postgresql/16/db/bigip
state: directory
mode: '0755'
become: true
delegate_to: '{{ hostvars[groups["servers"][1]].ansible_host }}'
remote_user: ubuntu
- name: Copy the schema to the remote 2
ansible.builtin.copy:
src: ./roles/bigip/vars/bigip.sql
dest: /etc/postgresql/16/db/bigip/bigip.sql
mode: '0755'
become: true
delegate_to: '{{ hostvars[groups["servers"][1]].ansible_host }}'
remote_user: ubuntu

- name: Create what is needed for the bigip load balancers
hosts: servers
remote_user: ubuntu
vars:
ostack_cloud: lard
ostack_region: Ostack2-EXT
gather_facts: false
# loops over both servers
roles:
- role: bigip
# will fail to create table in the standby (since read only)
- role: bigip
# will fail to create table in the standby (since read only)
14 changes: 7 additions & 7 deletions ansible/configure.yml
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
---
- name: Mount disks and install stuff on the VMs
hosts: servers
remote_user: ubuntu
vars:
ostack_cloud: lard
ipalias_network_name: ipalias
ostack_region: Ostack2-EXT
# loops over both servers
pre_tasks:
- name: List ansible_hosts_all difference from ansible_host (aka the vm not currently being iterated on)
ansible.builtin.debug:
msg: "{{ (ansible_play_hosts_all | difference([inventory_hostname])) | first }}"
msg: "{{ (ansible_play_hosts_all | difference([inventory_hostname])) | first }}"
roles:
- role: addsshkeys
- role: addsshkeys
- role: vm_format
vars:
name_stuff: '{{ inventory_hostname }}' # name of current vm for finding ipalias port
name_stuff: "{{ inventory_hostname }}" # name of current vm for finding ipalias port
- role: ssh
vars:
vm_ip: '{{ ansible_host }}' # the current vm's ip
vm_ip: "{{ ansible_host }}" # the current vm's ip

- name: Setup primary and standby
vars:
Expand All @@ -26,10 +26,10 @@
hosts: localhost
gather_facts: false

roles:
roles:
- role: primarystandbysetup
vars:
primary_name: lard-a
primary_ip: '{{ hostvars[groups["servers"][0]].ansible_host }}' # the first one is a
standby_name: lard-b
standby_ip: '{{ hostvars[groups["servers"][1]].ansible_host }}' # the second one is b
standby_ip: '{{ hostvars[groups["servers"][1]].ansible_host }}' # the second one is b
7 changes: 3 additions & 4 deletions ansible/deploy.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
---
- name: Deploy binaries
hosts: lard-a # deploy to lard-a for now, might need to define a separate VM later?
remote_user: ubuntu
vars:
ostack_cloud: lard
ostack_region: Ostack2-EXT
become: true
become_user: root
# become: true
# become_user: root
roles:
- role: deploy
# vars:
# deploy_ip: '{{ hostvars[...].ansible_host }}'
Empty file added ansible/group_vars/servers.yaml
Empty file.
2 changes: 2 additions & 0 deletions ansible/host_vars/lard-a.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
ansible_host: 123.123.123.123
2 changes: 2 additions & 0 deletions ansible/host_vars/lard-b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
ansible_host: 123.123.123.123
4 changes: 3 additions & 1 deletion ansible/inventory.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
---
servers:
hosts:
# TODO: add flag to filter when server is primary/secondary?
lard-a:
ansible_host: 157.249.*.*
lard-b:
ansible_host: 157.249.*.*
ansible_host: 157.249.*.*
11 changes: 6 additions & 5 deletions ansible/provision.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
- name: setup networks and 2 vms
---
- name: Setup networks and 2 vms
vars:
ostack_cloud: lard
ipalias_network_name: ipalias
Expand All @@ -10,11 +11,11 @@
- role: networks
- role: vm # in A
vars:
name_stuff: lard-a
availability_zone: ext-a
vm_name: lard-a
vm_availability_zone: ext-a
vm_ip: '{{ hostvars[groups["servers"][0]].ansible_host }}'
- role: vm # in B
vars:
name_stuff: lard-b
availability_zone: ext-b
vm_name_stuff: lard-b
vm_availability_zone: ext-b
vm_ip: '{{ hostvars[groups["servers"][1]].ansible_host }}'
101 changes: 56 additions & 45 deletions ansible/readme.md
Original file line number Diff line number Diff line change
@@ -1,81 +1,84 @@
## README for LARD setup on openstack(2)

#### Useful ansible commands:

```
ansible-inventory -i inventory.yml --graph
ansible servers -m ping -u ubuntu -i inventory.yml
```

#### Dependencies to install
```
pip3 install wheel # so you can allow downloading of binary python packages
pip install -r requirements.txt
ansible-galaxy collection install openstack.cloud
ansible-galaxy collection install community.postgresql
ansible-galaxy collection install community.general

ansible-galaxy collection install ansible.posix
```terminal
python3 -m venv {your_dir}
source {your_dir}/bin/activate
ansible-galaxy collection install ansible.utils
```
pip install -r requirements.txt
ansible-galaxy collection install -fr requirements.yml
```

### Get access to OpenStack
You need to create application credentials in the project you are going to create the instances in, so that the ansible scripts can connect to the right ostack_cloud which in our case needs to be called lard.

The file should exist here:
~/.config/openstack/clouds.yml
You need to create application credentials in the project you are going to
create the instances in, so that the ansible scripts can connect to the right
ostack_cloud which in our case needs to be called lard.

The file should exist in `~/.config/openstack/clouds.yml`.

If you have MET access, see what is written at the start of the readme here:
https://gitlab.met.no/it/infra/ostack-ansible21x-examples
If you have MET access, see what is written at the start of the readme [here](https://gitlab.met.no/it/infra/ostack-ansible21x-examples).

Or in the authentication section here:
https://gitlab.met.no/it/infra/ostack-doc/-/blob/master/ansible-os.md?ref_type=heads
Or in the authentication section [here](https://gitlab.met.no/it/infra/ostack-doc/-/blob/master/ansible-os.md?ref_type=heads).

### Add your public key to the Ostack GUI
Go to "Compute" then "Key Pairs" and import your public key for use in the provisioning step.

Go to "Compute" then "Key Pairs" and import your public key for use in the provisioning step.

### Provision!
The IPs in inventory.yml should correspond to floating ips you have requested in the network section of the open stack GUI. If you need to delete the old VMs (compute -> instances) and Volumes (volumes -> volumes) you can do so in the ostack GUI. *For some reason when deleting things to build up again one of the IPs did not get disassociated properly, and I had to do this manually (network -> floating IPs).*

The vars for the network and addssh tasks are encrypted with ansible-vault (ansible-vault decrypt roles/networks/vars/main.yml, ansible-vault decrypt roles/addsshkeys/vars/main.yml, ansible-vault decrypt roles/vm_format/vars/main.yml).
But if this has been set up before in the ostack project, these have likely already been run and therefore already exist, so you could comment out this role from provision.yml.
Passwords are in ci_cd variables https://gitlab.met.no/met/obsklim/bakkeobservasjoner/lagring-og-distribusjon/db-products/poda/-/settings/ci_cd
The IPs in inventory.yml should correspond to floating ips you have requested in the network section of the open stack GUI. If you need to delete the old VMs (compute -> instances) and Volumes (volumes -> volumes) you can do so in the ostack GUI. *For some reason when deleting things to build up again one of the IPs did not get disassociated properly, and I had to do this manually (network -> floating IPs).*

```
The vars for the network and addssh tasks are encrypted with ansible-vault
(ansible-vault decrypt roles/networks/vars/main.yml, ansible-vault decrypt
roles/addsshkeys/vars/main.yml, ansible-vault decrypt
roles/vm_format/vars/main.yml). But if this has been set up before in the ostack
project, these have likely already been run and therefore already exist, so you
could comment out this role from provision.yml. Passwords are in ci_cd
variables
https://gitlab.met.no/met/obsklim/bakkeobservasjoner/lagring-og-distribusjon/db-products/poda/-/settings/ci_cd

```terminal
ansible-playbook -i inventory.yml -e ostack_key_name=xxx provision.yml
```

After provisioning, the next steps may need to ssh into the hosts, and thus you need to add them to your known hosts. Ansible appears to be crap at this, so it's best to do it before running the next step by going:
`ssh ubuntu@157.249.*.*`
`ssh ubuntu@157.249.*.*`
For all the VMs.
If cleaning up from tearing down a previous set of VMs you may also need to remove them first:
`ssh-keygen -f "/home/louiseo/.ssh/known_hosts" -R "157.249.*.*"`

### Configure!
The third IP being passed in here is the one that gets associated with the primary, and moved when doing a switchover.
*NOTE:* The floating IP association times out, but this is ignored as it is a known bug.

```
The third IP being passed in here is the one that gets associated with the primary, and moved when doing a switchover.
*NOTE:* The floating IP association times out, but this is ignored as it is a known bug.

```term
ansible-playbook -i inventory.yml -e primary_floating_ip='157.249.*.*' -e db_password=xxx -e repmgr_password=xxx configure.yml
```

The parts to do with the floating ip that belongs to the primary (ipalias) are based on:
The parts to do with the floating ip that belongs to the primary (ipalias) are based on:
https://gitlab.met.no/ansible-roles/ipalias/-/tree/master?ref_type=heads

### Connect to database

```
PGPASSWORD=xxx psql -h 157.249.*.* -p 5432 -U lard_user -d lard
```

### Checking the cluster

Become postgres user: sudo su postgres

```
postgres@lard-b:/home/ubuntu$ repmgr -f /etc/repmgr.conf node check
Node "lard-b":
Expand All @@ -88,6 +91,7 @@ Node "lard-b":
Missing physical replication slots: OK (node has no missing physical replication slots)
Configured data directory: OK (configured "data_directory" is "/mnt/ssd-b/16/main")
```

```
postgres@lard-a:/home/ubuntu$ repmgr -f /etc/repmgr.conf node check
Node "lard-a":
Expand All @@ -101,12 +105,13 @@ Node "lard-a":
Configured data directory: OK (configured "data_directory" is "/mnt/ssd-b/16/main")
```

While a few of the configurations are found in /etc/postgresql/16/main/postgresql.conf (particularly in the ansible block at the end), many of them
While a few of the configurations are found in /etc/postgresql/16/main/postgresql.conf (particularly in the ansible block at the end), many of them
can only be seen in /mnt/ssd-b/16/main/postgresql.auto.conf (need sudo to see contents).

### Perform switchover
This should only be used when both VMs are up and running, like in the case of planned maintenance on one datarom.
Then we would use this script to switch the primary to the datarom that will stay available ahead of time.

This should only be used when both VMs are up and running, like in the case of planned maintenance on one datarom.
Then we would use this script to switch the primary to the datarom that will stay available ahead of time.

*Make sure you are aware which one is the master, and put the names the right way around in this call.*

Expand All @@ -118,8 +123,9 @@ This should also be possible to do manually, but might need to follow what is do
`repmgr standby switchover -f /etc/repmgr.conf --siblings-follow` (need to be postgres user)

### Promote standby (assuming the primary is down)
Make sure you know which one you want to promote!
This is used in the case where the primary has gone down (e.g. unplanned downtime of a datarom).

Make sure you know which one you want to promote!\
This is used in the case where the primary has gone down (e.g. unplanned downtime of a datarom).

**Manually:**
SSH into the standby
Expand All @@ -132,25 +138,28 @@ You can the check the status again (and now the old primary will say failed)
Then move the ip in the ostack gui (see in network -> floating ips, disassociate it then associate it with the ipalias port on the other VM)

#### Later, when the old primary comes back up
The cluster will be in a slightly confused state, because this VM still thinks it's a primary (although repmgr tells it the other one is running as a primary as well). If the setup is running as asynchronous we could lose data that wasn't copied over before the crash, if running synchronously then there should be no data loss.

The cluster will be in a slightly confused state, because this VM still thinks it's a primary (although repmgr tells it the other one is running as a primary as well). If the setup is running as asynchronous we could lose data that wasn't copied over before the crash, if running synchronously then there should be no data loss.

SSH into the new primary
`repmgr -f /etc/repmgr.conf cluster show`
says:

- node "lard-a" (ID: 1) is running but the repmgr node record is inactive

SSH into the old primary
`repmgr -f /etc/repmgr.conf cluster show`
says:
- node "lard-b" (ID: 2) is registered as standby but running as primary

- node "lard-b" (ID: 2) is registered as standby but running as primary

With a **playbook** (rejoin_ip is the ip of the node that has been down and should now be a standby not a primary):

```
ansible-playbook -i inventory.yml -e rejoin_ip=157.249.*.* -e primary_ip=157.249.*.* rejoin.yml
```

Or **manually**:
Or **manually**:
Make sure the pg process is stopped (see fast stop command) if it isn't already

Become postgres user:
Expand All @@ -161,22 +170,24 @@ Perform a rejoin
`repmgr node rejoin -f /etc/repmgr.conf -d 'host=157.249.*.* user=repmgr dbname=repmgr connect_timeout=2' --force-rewind=/usr/lib/postgresql/16/bin/pg_rewind --verbose`

### for testing:
Take out one of the replicas (or can shut off instance in the openstack GUI):

Take out one of the replicas (or can shut off instance in the openstack GUI):
`sudo pg_ctlcluster 16 main -m fast stop`
For bringing it back up (or turn it back on):
`sudo pg_ctlcluster 16 main start`

### for load balancing at MET
This role creates a user and basic db for the loadbalancer to test the health of the db. Part of the role is allowed to fail on the secondary ("cannot execute ___ in a read-only transaction"), as it should pass on the primary and be replicated over. The hba conf change needs to be run on both.

The vars are encrypted, so run: ansible-vault decrypt roles/bigip/vars/main.yml
This role creates a user and basic db for the loadbalancer to test the health of the db. Part of the role is allowed to fail on the secondary ("cannot execute \_\_\_ in a read-only transaction"), as it should pass on the primary and be replicated over. The hba conf change needs to be run on both.

The vars are encrypted, so run: ansible-vault decrypt roles/bigip/vars/main.yml

Then run the bigip role on the VMs:

```
ansible-playbook -i inventory.yml -e bigip_password=xxx bigip.yml
```

### Links:
### Links:

https://www.enterprisedb.com/postgres-tutorials/postgresql-replication-and-automatic-failover-tutorial#replication
https://www.enterprisedb.com/postgres-tutorials/postgresql-replication-and-automatic-failover-tutorial#replication
Loading

0 comments on commit cf1b54a

Please sign in to comment.