From 02edc1ee9d4747b88b71f5ea66bc2f7c636a9ee9 Mon Sep 17 00:00:00 2001 From: Georgiy Lebedev Date: Mon, 4 Sep 2023 12:53:15 +0300 Subject: [PATCH] New network topology for firecracker VMs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, each firecracker VM needs to use a TAP network device, to route its packages into the network stack of the physical host. When saving and restoring a function instance, the tap device name and the IP address of the functions’ server, running inside the container, are preserved (see also the current requirements for vanilla firecracker snapshot loading [1]). This leads to networking conflicts on the host and limits the snapshot restoration to a single instance per physical machine. To bypass this obstacle, the following network topology is proposed: 1. A new network namespace (e.g.: VMns4) is created for each VM, in which the TAP device from the snapshotted VM is rebuilt and receives the original IP address of the function. The TAP device will broadcast all the incoming and outgoing packets to and from the serverless function and VM’s network interface. Each VM will run in its own network namespace, leading to no conflicts on the host due to networking resources. 2. A local virtual tunnel is established between the VM inside its network namespace and the host node via a virtual ethernet pair (veth). A link is then established between the two ends of the virtual ethernet pair, in the network namespace (veth4-0) and the host namespace (veth4-1). In contrast, the default vHive configuration sets up a similar forwarding system through network bridges. 3. Inside the network namespace we add a routing rule that redirects all packets via the veth VM end towards a default gateway (172.17.0.17). Thus, all packets sent by the function will show at the hosts’ end of the tunnel. 4. To avoid IP conflicts when routing the packets to and from functions, each VM is assigned a unique clone address (172.18.0.5). All packets leaving the VM end of the virtual ethernet pair get their source address rewritten to the clone address of the corresponding VM. Packets entering the host end of the virtual ethernet pair get their destination address written to the original address of the VM. As a result, each VM still thinks it is using the original address while in reality, its address is translated to a clone address, different for every VM. This is accomplished using two rules in the NAT table corresponding to the virtual namespace of the VM. One rule is added in the POSTROUTING chain and one in the PREROUTING chain. The POSTROUTING rule alters the network packets before they are sent out in the virtual tunnel, from the VM namespace to the host, and rewrites the IP source address of the packet. Similarly, the PREROUTING rule overwrites the destination address of incoming packets, before routing. The two ensure that packets going into the virtual namespace have their destination address the original IP address of the VM (172.16.0.2), while packets coming out of the namespace have their source address the clone IP address (172.18.05). The source IP address will remain the same for all the VM in the enhanced snapshotting mode, being set to 172.16.0.2 respectively. 5. In the routing table of the host, we add a rule that dictates that any package that has as destination IP the clone IP of a VM, will be routed towards the end of the tunnel situated in the corresponding network namespace, through a set gateway (172.17.0.18). This ensures that whenever packages arrive on the host for a VM, they will be sent down the right virtual tunnel instantaneously. 6. In the hosts NFT filter table we add 2 rules for the FORWARD chain, that allow traffic from the host end of the veth pair (veth4-1) to the default host interface (eno 49) and vice versa. Introduce a new networking management component for the topology described above. 1. https://github.com/firecracker-microvm/firecracker/blob/main/docs/snapshotting/snapshot-support.md#loading-snapshots Closes #797 Part of #794 Signed-off-by: Georgiy Lebedev --- .github/workflows/unit_tests.yml | 2 +- go.mod | 2 +- networking/Makefile | 33 ++ networking/networkManager.go | 253 +++++++++++++++ networking/networkconfig.go | 292 ++++++++++++++++++ networking/networking.go | 515 +++++++++++++++++++++++++++++++ networking/networking_test.go | 110 +++++++ 7 files changed, 1205 insertions(+), 2 deletions(-) create mode 100644 networking/Makefile create mode 100644 networking/networkManager.go create mode 100644 networking/networkconfig.go create mode 100644 networking/networking.go create mode 100644 networking/networking_test.go diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 1ed0e1ffd..4d1eed1e8 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -26,7 +26,7 @@ jobs: strategy: fail-fast: false matrix: - module: [taps, misc, profile] + module: [taps, misc, profile, networking] steps: - name: Set up Go 1.19 diff --git a/go.mod b/go.mod index 19fdd064c..c4f25cb0d 100644 --- a/go.mod +++ b/go.mod @@ -60,6 +60,7 @@ require ( github.com/stretchr/testify v1.8.0 github.com/vhive-serverless/vhive/examples/protobuf/helloworld v0.0.0-00010101000000-000000000000 github.com/vishvananda/netlink v1.1.1-0.20201029203352-d40f9887b852 + github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae github.com/wcharczuk/go-chart v2.0.1+incompatible golang.org/x/net v0.6.0 golang.org/x/sync v0.1.0 @@ -102,7 +103,6 @@ require ( github.com/opencontainers/runtime-spec v1.0.3-0.20200929063507-e6143ca7d51d // indirect github.com/opencontainers/selinux v1.8.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae // indirect github.com/willf/bitset v1.1.11 // indirect go.opencensus.io v0.22.4 // indirect golang.org/x/image v0.7.0 // indirect diff --git a/networking/Makefile b/networking/Makefile new file mode 100644 index 000000000..62cde9ddf --- /dev/null +++ b/networking/Makefile @@ -0,0 +1,33 @@ +# MIT License +# +# Copyright (c) 2023 Georgiy Lebedev, Dmitrii Ustiugov, Plamen Petrov and vHive team +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXTRAGOARGS:=-v -race -cover + +test: + # Need to pass GOROOT because GitHub-hosted runners may have several + # go versions installed so that calling go from root may fail + sudo env "PATH=$(PATH)" "GOROOT=$(GOROOT)" go test ./ $(EXTRAGOARGS) + +test-man: + echo "Nothing to test manually" + +.PHONY: test test-man \ No newline at end of file diff --git a/networking/networkManager.go b/networking/networkManager.go new file mode 100644 index 000000000..a4515b603 --- /dev/null +++ b/networking/networkManager.go @@ -0,0 +1,253 @@ +// MIT License +// +// Copyright (c) 2023 Georgiy Lebedev, Amory Hoste and vHive team +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Package networking provides primitives to connect function instances to the network. +package networking + +import ( + log "github.com/sirupsen/logrus" + "sync" +) + +// NetworkManager manages the in use network configurations along with a pool of free network configurations +// that can be used to connect a function instance to the network. +type NetworkManager struct { + sync.Mutex + nextID int + hostIfaceName string + + // Pool of free network configs + networkPool []*NetworkConfig + poolCond *sync.Cond + poolSize int + + // Mapping of function instance IDs to their network config + netConfigs map[string]*NetworkConfig + + // Network configs that are being created + inCreation sync.WaitGroup +} + +// NewNetworkManager creates and returns a new network manager that connects function instances to the network +// using the supplied interface. If no interface is supplied, the default interface is used. To take the network +// setup of the critical path of a function creation, the network manager tries to maintain a pool of ready to use +// network configurations of size at least poolSize. +func NewNetworkManager(hostIfaceName string, poolSize int) (*NetworkManager, error) { + manager := new(NetworkManager) + + manager.hostIfaceName = hostIfaceName + if manager.hostIfaceName == "" { + hostIface, err := getHostIfaceName() + if err != nil { + return nil, err + } else { + manager.hostIfaceName = hostIface + } + } + + manager.netConfigs = make(map[string]*NetworkConfig) + manager.networkPool = make([]*NetworkConfig, 0) + + startId, err := getNetworkStartID() + if err == nil { + manager.nextID = startId + } else { + manager.nextID = 0 + } + + manager.poolCond = sync.NewCond(new(sync.Mutex)) + manager.initConfigPool(poolSize) + manager.poolSize = poolSize + + return manager, nil +} + +// initConfigPool fills an empty network pool up to the given poolSize +func (mgr *NetworkManager) initConfigPool(poolSize int) { + var wg sync.WaitGroup + wg.Add(poolSize) + + logger := log.WithFields(log.Fields{"poolSize": poolSize}) + logger.Debug("Initializing network pool") + + // Concurrently create poolSize network configs + for i := 0; i < poolSize; i++ { + go func() { + mgr.addNetConfig() + wg.Done() + }() + } + wg.Wait() +} + +// addNetConfig creates and initializes a new network config +func (mgr *NetworkManager) addNetConfig() { + mgr.Lock() + id := mgr.nextID + mgr.nextID += 1 + mgr.inCreation.Add(1) + mgr.Unlock() + + netCfg := NewNetworkConfig(id, mgr.hostIfaceName) + if err := netCfg.CreateNetwork(); err != nil { + log.Errorf("failed to create network %s:", err) + } + + mgr.poolCond.L.Lock() + mgr.networkPool = append(mgr.networkPool, netCfg) + // Signal in case someone is waiting for a new config to become available in the pool + mgr.poolCond.Signal() + mgr.poolCond.L.Unlock() + mgr.inCreation.Done() +} + +// allocNetConfig allocates a new network config from the pool to a function instance identified by funcID +func (mgr *NetworkManager) allocNetConfig(funcID string) *NetworkConfig { + // Add netconfig to pool to keep pool to configured size + go mgr.addNetConfig() + + logger := log.WithFields(log.Fields{"funcID": funcID}) + logger.Debug("Allocating a new network config from network pool to function instance") + + // Pop a network config from the pool and allocate it to the function instance + mgr.poolCond.L.Lock() + for len(mgr.networkPool) == 0 { + // Wait until a new network config has been created + mgr.poolCond.Wait() + } + + config := mgr.networkPool[len(mgr.networkPool)-1] + mgr.networkPool = mgr.networkPool[:len(mgr.networkPool)-1] + mgr.poolCond.L.Unlock() + + mgr.Lock() + mgr.netConfigs[funcID] = config + mgr.Unlock() + + logger = log.WithFields(log.Fields{ + "funcID": funcID, + "ContainerIP": config.getContainerIP(), + "NamespaceName": config.getNamespaceName(), + "Veth0CIDR": config.getVeth0CIDR(), + "Veth0Name": config.getVeth0Name(), + "Veth1CIDR": config.getVeth1CIDR(), + "Veth1Name": config.getVeth1Name(), + "CloneIP": config.GetCloneIP(), + "ContainerCIDR": config.GetContainerCIDR(), + "GatewayIP": config.GetGatewayIP(), + "HostDevName": config.GetHostDevName(), + "NamespacePath": config.GetNamespacePath()}) + + logger.Debug("Allocated a new network config") + + return config +} + +// releaseNetConfig releases the network config of a given function instance with id funcID back to the pool +func (mgr *NetworkManager) releaseNetConfig(funcID string) { + mgr.Lock() + config := mgr.netConfigs[funcID] + delete(mgr.netConfigs, funcID) + mgr.Unlock() + + logger := log.WithFields(log.Fields{"funcID": funcID}) + logger.Debug("Releasing network config from function instance and adding it to network pool") + + // Add network config back to the pool. We allow the pool to grow over it's configured size here since the + // overhead of keeping a network config in the pool is low compared to the cost of creating a new config. + mgr.poolCond.L.Lock() + mgr.networkPool = append(mgr.networkPool, config) + mgr.poolCond.Signal() + mgr.poolCond.L.Unlock() +} + +// CreateNetwork creates the networking for a function instance identified by funcID +func (mgr *NetworkManager) CreateNetwork(funcID string) (*NetworkConfig, error) { + logger := log.WithFields(log.Fields{"funcID": funcID}) + logger.Debug("Creating network config for function instance") + + netCfg := mgr.allocNetConfig(funcID) + return netCfg, nil +} + +// GetConfig returns the network config assigned to a function instance identified by funcID +func (mgr *NetworkManager) GetConfig(funcID string) *NetworkConfig { + mgr.Lock() + defer mgr.Unlock() + + cfg := mgr.netConfigs[funcID] + return cfg +} + +// RemoveNetwork removes the network config of a function instance identified by funcID. The allocated network devices +// for the given function instance must not be in use anymore when calling this function. +func (mgr *NetworkManager) RemoveNetwork(funcID string) error { + logger := log.WithFields(log.Fields{"funcID": funcID}) + logger.Debug("Removing network config for function instance") + mgr.releaseNetConfig(funcID) + return nil +} + +// Cleanup removes and deallocates all network configurations that are in use or in the network pool. Make sure to first +// clean up all running functions before removing their network configs. +func (mgr *NetworkManager) Cleanup() error { + log.Info("Cleaning up network manager") + mgr.Lock() + defer mgr.Unlock() + + // Wait till all network configs still in creation are added + mgr.inCreation.Wait() + + // Release network configs still in use + var wgu sync.WaitGroup + wgu.Add(len(mgr.netConfigs)) + for funcID := range mgr.netConfigs { + config := mgr.netConfigs[funcID] + go func(config *NetworkConfig) { + if err := config.RemoveNetwork(); err != nil { + log.Errorf("failed to remove network %s:", err) + } + wgu.Done() + }(config) + } + wgu.Wait() + mgr.netConfigs = make(map[string]*NetworkConfig) + + // Cleanup network pool + mgr.poolCond.L.Lock() + var wg sync.WaitGroup + wg.Add(len(mgr.networkPool)) + + for _, config := range mgr.networkPool { + go func(config *NetworkConfig) { + if err := config.RemoveNetwork(); err != nil { + log.Errorf("failed to remove network %s:", err) + } + wg.Done() + }(config) + } + wg.Wait() + mgr.networkPool = make([]*NetworkConfig, 0) + mgr.poolCond.L.Unlock() + + return nil +} diff --git a/networking/networkconfig.go b/networking/networkconfig.go new file mode 100644 index 000000000..a14525df6 --- /dev/null +++ b/networking/networkconfig.go @@ -0,0 +1,292 @@ +// MIT License +// +// Copyright (c) 2023 Georgiy Lebedev, Amory Hoste and vHive team +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +package networking + +import ( + "fmt" + "github.com/pkg/errors" + log "github.com/sirupsen/logrus" + "github.com/vishvananda/netns" + "net" + "runtime" +) + +const ( + defaultContainerCIDR = "172.16.0.2/24" + defaultGatewayCIDR = "172.16.0.1/24" + defaultContainerTap = "tap0" + defaultContainerMac = "AA:FC:00:00:00:01" +) + +// NetworkConfig represents the network devices, IPs, namespaces, routes and filter rules to connect a uVM +// to the network. The network config ID is deterministically mapped to IP addresses to be used for the uVM. +// Note that due to the current allocation of IPs at most 2^14 VMs can be simultaneously be available on a single host. +type NetworkConfig struct { + id int + containerCIDR string // Container IP address (CIDR notation) + gatewayCIDR string // Container gateway IP address (CIDR notation) + containerTap string // Container tap name + containerMac string // Container Mac address + hostIfaceName string // Host network interface name +} + +// NewNetworkConfig creates a new network config with a given id and default host interface +func NewNetworkConfig(id int, hostIfaceName string) *NetworkConfig { + return &NetworkConfig{ + id: id, + containerCIDR: defaultContainerCIDR, + gatewayCIDR: defaultGatewayCIDR, + containerTap: defaultContainerTap, + containerMac: defaultContainerMac, + hostIfaceName: hostIfaceName, + } +} + +// GetMacAddress returns the mac address used for the uVM +func (cfg *NetworkConfig) GetMacAddress() string { + return cfg.containerMac +} + +// GetHostDevName returns the device connecting the uVM to the host +func (cfg *NetworkConfig) GetHostDevName() string { + return cfg.containerTap +} + +// getVeth0Name returns the name for the veth device at the side of the uVM +func (cfg *NetworkConfig) getVeth0Name() string { + return fmt.Sprintf("veth%d-0", cfg.id) +} + +// getVeth0CIDR returns the IP address for the veth device at the side of the uVM in CIDR notation +func (cfg *NetworkConfig) getVeth0CIDR() string { + return fmt.Sprintf("172.17.%d.%d/30", (4*cfg.id)/256, ((4*cfg.id)+2)%256) +} + +// getVeth1Name returns the name for the veth device at the side of the host +func (cfg *NetworkConfig) getVeth1Name() string { + return fmt.Sprintf("veth%d-1", cfg.id) +} + +// getVeth1Name returns the IP address for the veth device at the side of the host in CIDR notation +func (cfg *NetworkConfig) getVeth1CIDR() string { + return fmt.Sprintf("172.17.%d.%d/30", (4*cfg.id)/256, ((4*cfg.id)+1)%256) +} + +// GetCloneIP returns the IP address the uVM is reachable at from the host +func (cfg *NetworkConfig) GetCloneIP() string { + return fmt.Sprintf("172.18.%d.%d", cfg.id/254, 1+(cfg.id%254)) +} + +// GetContainerCIDR returns the internal IP of the uVM in CIDR notation +func (cfg *NetworkConfig) GetContainerCIDR() string { + return cfg.containerCIDR +} + +// getNamespaceName returns the network namespace name for the uVM +func (cfg *NetworkConfig) getNamespaceName() string { + return fmt.Sprintf("uvmns%d", cfg.id) +} + +// GetNamespacePath returns the full path to the network namespace for the uVM +func (cfg *NetworkConfig) GetNamespacePath() string { + return fmt.Sprintf("/var/run/netns/%s", cfg.getNamespaceName()) +} + +// getContainerIP returns the internal IP of the uVM +func (cfg *NetworkConfig) getContainerIP() string { + ip, _, _ := net.ParseCIDR(cfg.containerCIDR) + return ip.String() +} + +// GetGatewayIP returns the IP address of the tap device associated with the uVM +func (cfg *NetworkConfig) GetGatewayIP() string { + ip, _, _ := net.ParseCIDR(cfg.gatewayCIDR) + return ip.String() +} + +// createVmNetwork creates network devices, namespaces, routes and filter rules for the uVM at the +// uVM side +func (cfg *NetworkConfig) createVmNetwork(hostNsHandle netns.NsHandle) error { + // A. In uVM netns + // A.1. Create network namespace for uVM & join network namespace + vmNsHandle, err := netns.NewNamed(cfg.getNamespaceName()) // Switches namespace + if err != nil { + log.Println(err) + return err + } + defer func() { _ = vmNsHandle.Close() }() + + // A.2. Create tap device for uVM + if err := createTap(cfg.containerTap, cfg.gatewayCIDR, cfg.getNamespaceName()); err != nil { + return err + } + + // A.3. Create veth pair for uVM + // A.3.1 Create veth pair + if err := createVethPair(cfg.getVeth0Name(), cfg.getVeth1Name(), vmNsHandle, hostNsHandle); err != nil { + return err + } + + // A.3.2 Configure uVM side veth pair + if err := configVeth(cfg.getVeth0Name(), cfg.getVeth0CIDR()); err != nil { + return err + } + + // A.3.3 Designate host side as default gateway for packets leaving namespace + if err := setDefaultGateway(cfg.getVeth1CIDR()); err != nil { + return err + } + + // A.4. Setup NAT rules + if err := setupNatRules(cfg.getVeth0Name(), cfg.getContainerIP(), cfg.GetCloneIP(), vmNsHandle); err != nil { + return err + } + + return nil +} + +// createHostNetwork creates network devices, namespaces, routes and filter rules for the uVM at the host +// side +func (cfg *NetworkConfig) createHostNetwork() error { + // B. In host netns + // B.1 Configure host side veth pair + if err := configVeth(cfg.getVeth1Name(), cfg.getVeth1CIDR()); err != nil { + return err + } + + // B.2 Add a route on the host for the clone address + if err := addRoute(cfg.GetCloneIP(), cfg.getVeth0CIDR()); err != nil { + return err + } + + // B.3 Setup nat to route traffic out of veth device + if err := setupForwardRules(cfg.getVeth1Name(), cfg.hostIfaceName); err != nil { + return err + } + return nil +} + +// CreateNetwork creates the necessary network devices, namespaces, routes and filter rules to connect the uVM to the +// network. The networking is created as described in the Firecracker documentation on providing networking for clones +// (https://github.com/firecracker-microvm/firecracker/blob/main/docs/snapshotting/network-for-clones.md) +func (cfg *NetworkConfig) CreateNetwork() error { + // 1. Lock the OS Thread so we don't accidentally switch namespaces + runtime.LockOSThread() + + // 2. Get host network namespace + hostNsHandle, err := netns.Get() + defer func() { _ = hostNsHandle.Close() }() + if err != nil { + log.Printf("Failed to get host ns, %s\n", err) + return err + } + + // 3. Setup networking in instance namespace + if err := cfg.createVmNetwork(hostNsHandle); err != nil { + _ = netns.Set(hostNsHandle) + runtime.UnlockOSThread() + return err + } + + // 4. Go back to host namespace + err = netns.Set(hostNsHandle) + if err != nil { + return err + } + + runtime.UnlockOSThread() + + // 5. Setup networking in host namespace + if err := cfg.createHostNetwork(); err != nil { + return err + } + + return nil +} + +// CreateNetwork removes the necessary network devices, namespaces, routes and filter rules to connect the +// function instance to the network +func (cfg *NetworkConfig) RemoveNetwork() error { + // Delete nat to route traffic out of veth device + if err := deleteForwardRules(cfg.getVeth1Name()); err != nil { + return err + } + + // Delete route on the host for the clone address + if err := deleteRoute(cfg.GetCloneIP(), cfg.getVeth0CIDR()); err != nil { + return err + } + + runtime.LockOSThread() + + hostNsHandle, err := netns.Get() + defer func() { _ = hostNsHandle.Close() }() + if err != nil { + log.Printf("Failed to get host ns, %s\n", err) + return err + } + + // Get uVM namespace handle + vmNsHandle, err := netns.GetFromName(cfg.getNamespaceName()) + defer func() { _ = vmNsHandle.Close() }() + if err != nil { + return err + } + err = netns.Set(vmNsHandle) + if err != nil { + return err + } + + // Delete NAT rules + if err := deleteNatRules(vmNsHandle); err != nil { + return err + } + + // Delete default gateway for packets leaving namespace + if err := deleteDefaultGateway(cfg.getVeth1CIDR()); err != nil { + return err + } + + // Delete uVM side veth pair + if err := deleteVethPair(cfg.getVeth0Name(), cfg.getVeth1Name(), vmNsHandle, hostNsHandle); err != nil { + return err + } + + // Delete tap device for uVM + if err := deleteTap(cfg.containerTap); err != nil { + return err + } + + // Delete namespace + if err := netns.DeleteNamed(cfg.getNamespaceName()); err != nil { + return errors.Wrapf(err, "deleting network namespace") + } + + err = netns.Set(hostNsHandle) + if err != nil { + return err + } + runtime.UnlockOSThread() + + return nil +} diff --git a/networking/networking.go b/networking/networking.go new file mode 100644 index 000000000..c7650fd59 --- /dev/null +++ b/networking/networking.go @@ -0,0 +1,515 @@ +// MIT License +// +// Copyright (c) 2023 Georgiy Lebedev, Amory Hoste and vHive team +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +package networking + +import ( + "bufio" + "bytes" + "fmt" + "github.com/google/nftables" + "github.com/google/nftables/expr" + "github.com/pkg/errors" + log "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink" + "github.com/vishvananda/netns" + "golang.org/x/sys/unix" + "net" + "os" + "os/exec" + "regexp" + "strconv" + "strings" +) + +// getHostIfaceName returns the default host network interface name. +func getHostIfaceName() (string, error) { + out, err := exec.Command( + "route", + ).Output() + if err != nil { + log.Warnf("Failed to fetch host net interfaces %v\n%s\n", err, out) + return "", err + } + + scanner := bufio.NewScanner(bytes.NewReader(out)) + for scanner.Scan() { + line := scanner.Text() + if strings.Contains(line, "default") { + return line[strings.LastIndex(line, " ")+1:], nil + } + } + return "", errors.New("Failed to fetch host net interface") +} + +// createTap creates a TAP device with name tapName, IP gatewayIP in the network namespace with name netnsName +func createTap(tapName, gatewayIP, netnsName string) error { + // 1. Create tap device + + logger := log.WithFields(log.Fields{"tap": tapName, "IP gateway": gatewayIP, "namespace": netnsName}) + + la := netlink.NewLinkAttrs() + la.Name = tapName + la.Namespace = netnsName + + logger.Debug("Creating tap for virtual network") + + tap0 := &netlink.Tuntap{LinkAttrs: la, Mode: netlink.TUNTAP_MODE_TAP} + if err := netlink.LinkAdd(tap0); err != nil { + return errors.Wrapf(err, "creating tap") + } + + // 2. Give tap device ip address + addr, _ := netlink.ParseAddr(gatewayIP) + addr.Broadcast = net.IPv4(0, 0, 0, 0) + if err := netlink.AddrAdd(tap0, addr); err != nil { + return errors.Wrapf(err, "adding tap ip address") + } + + // 3. Enable tap network interface + if err := netlink.LinkSetUp(tap0); err != nil { + return errors.Wrapf(err, "enabling tap") + } + + return nil +} + +// deleteTap deletes the tap device identified by name tapName +func deleteTap(tapName string) error { + logger := log.WithFields(log.Fields{"tap": tapName}) + logger.Debug("Removing tap") + if err := netlink.LinkDel(&netlink.Tuntap{LinkAttrs: netlink.LinkAttrs{Name: tapName}}); err != nil { + return errors.Wrapf(err, "deleting tap %s", tapName) + } + + return nil +} + +// createVethPair creates a virtual ethernet pair connecting the supplied namespaces +func createVethPair(veth0Name, veth1Name string, veth0NsHandle, veth1NsHandle netns.NsHandle) error { + veth := &netlink.Veth{LinkAttrs: netlink.LinkAttrs{Name: veth0Name, Namespace: netlink.NsFd(veth0NsHandle), TxQLen: 1000}, PeerName: veth1Name, PeerNamespace: netlink.NsFd(veth1NsHandle)} + if err := netlink.LinkAdd(veth); err != nil { + return errors.Wrapf(err, "creating veth pair") + } + + return nil +} + +// deleteVethPair deletes the virtual ethernet pair connecting the supplied namespaces +func deleteVethPair(veth0Name, veth1Name string, veth0NsHandle, veth1NsHandle netns.NsHandle) error { + if err := netlink.LinkDel(&netlink.Veth{LinkAttrs: netlink.LinkAttrs{Name: veth0Name, Namespace: netlink.NsFd(veth0NsHandle)}, PeerName: veth1Name, PeerNamespace: netlink.NsFd(veth1NsHandle)}); err != nil { + return errors.Wrapf(err, "deleting veth %s", veth0Name) + } + return nil +} + +// configVeth configures the IP address of a veth device and enables the device +func configVeth(linkName, vethIp string) error { + // 1. Get link + veth, err := netlink.LinkByName(linkName) + if err != nil { + return errors.Wrapf(err, "Finding veth link") + } + + // 2. Set IP address + addr, _ := netlink.ParseAddr(vethIp) + addr.Broadcast = net.IPv4(0, 0, 0, 0) + if err := netlink.AddrAdd(veth, addr); err != nil { + return errors.Wrapf(err, "adding veth link ip address") + } + + // 3. Enable link + if err := netlink.LinkSetUp(veth); err != nil { + return errors.Wrapf(err, "enabling veth link") + } + + return nil +} + +// setDefaultGateway creates a default routing rule to the supplied gatewayIP +func setDefaultGateway(gatewayIp string) error { + gw, _, err := net.ParseCIDR(gatewayIp) + if err != nil { + return errors.Wrapf(err, "parsing ip") + } + + defaultRoute := &netlink.Route{ + Dst: nil, + Gw: gw, + } + + if err := netlink.RouteAdd(defaultRoute); err != nil { + return errors.Wrapf(err, "adding default route") + } + + return nil +} + +// deleteDefaultGateway deletes the default routing rule to the supplied gatewayIP +func deleteDefaultGateway(gatewayIp string) error { + gw, _, err := net.ParseCIDR(gatewayIp) + if err != nil { + return errors.Wrapf(err, "parsing ip") + } + + defaultRoute := &netlink.Route{ + Dst: nil, + Gw: gw, + } + + if err := netlink.RouteDel(defaultRoute); err != nil { + return errors.Wrapf(err, "deleting default route") + } + + return nil +} + +// setupNatRules configures the NAT rules. Each uVMs address is translated to an external clone address to avoid +// conflicts (see https://github.com/firecracker-microvm/firecracker/blob/main/docs/snapshotting/network-for-clones.md) +func setupNatRules(vethVmName, hostIp, cloneIp string, vmNsHandle netns.NsHandle) error { + conn := nftables.Conn{NetNS: int(vmNsHandle)} + + // 1. add table ip nat + natTable := &nftables.Table{ + Name: "nat", + Family: nftables.TableFamilyIPv4, + } + + // 2. Iptables: -t nat -A POSTROUTING -o veth1-0 -s 172.16.0.2 -j SNAT --to 192.168.0.1 + // 2.1 add chain ip nat POSTROUTING { type nat hook postrouting priority 0; policy accept; } + polAccept := nftables.ChainPolicyAccept + postRouteCh := &nftables.Chain{ + Name: "POSTROUTING", + Table: natTable, + Type: nftables.ChainTypeNAT, + Priority: 0, + Hooknum: nftables.ChainHookPostrouting, + Policy: &polAccept, + } + + // 2.2 add rule ip nat POSTROUTING oifname veth1-0 ip saddr 172.16.0.2 counter snat to 192.168.0.1 + snatRule := &nftables.Rule{ + Table: natTable, + Chain: postRouteCh, + Exprs: []expr.Any{ + // Load iffname in register 1 + &expr.Meta{Key: expr.MetaKeyOIFNAME, Register: 1}, + // Check iifname == veth1-0 + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: []byte(fmt.Sprintf("%s\x00", vethVmName)), + }, + // Load source IP address (offset 12 bytes network header) in register 1 + &expr.Payload{ + DestRegister: 1, + Base: expr.PayloadBaseNetworkHeader, + Offset: 12, + Len: 4, + }, + // Check source ip address == 172.16.0.2 + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: net.ParseIP(hostIp).To4(), + }, + // Load snatted address (192.168.0.1) in register 1 + &expr.Immediate{ + Register: 1, + Data: net.ParseIP(cloneIp).To4(), + }, + &expr.NAT{ + Type: expr.NATTypeSourceNAT, // Snat + Family: unix.NFPROTO_IPV4, + RegAddrMin: 1, + }, + }, + } + + // 3. Iptables: -t nat -A PREROUTING -i veth1-0 -d 192.168.0.1 -j DNAT --to 172.16.0.2 + // 3.1 add chain ip nat PREROUTING { type nat hook prerouting priority 0; policy accept; } + preRouteCh := &nftables.Chain{ + Name: "PREROUTING", + Table: natTable, + Type: nftables.ChainTypeNAT, + Priority: 0, + Hooknum: nftables.ChainHookPrerouting, + Policy: &polAccept, + } + + // 3.2 add rule ip nat PREROUTING iifname veth1-0 ip daddr 192.168.0.1 counter dnat to 172.16.0.2 + dnatRule := &nftables.Rule{ + Table: natTable, + Chain: preRouteCh, + Exprs: []expr.Any{ + // Load iffname in register 1 + &expr.Meta{Key: expr.MetaKeyIIFNAME, Register: 1}, + // Check iifname == veth1-0 + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: []byte(fmt.Sprintf("%s\x00", vethVmName)), + }, + // Load destination IP address (offset 16 bytes network header) in register 1 + &expr.Payload{ + DestRegister: 1, + Base: expr.PayloadBaseNetworkHeader, + Offset: 16, + Len: 4, + }, + // Check destination ip address == 192.168.0.1 + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: net.ParseIP(cloneIp).To4(), + }, + // Load dnatted address (172.16.0.2) in register 1 + &expr.Immediate{ + Register: 1, + Data: net.ParseIP(hostIp).To4(), + }, + &expr.NAT{ + Type: expr.NATTypeDestNAT, // Dnat + Family: unix.NFPROTO_IPV4, + RegAddrMin: 1, + }, + }, + } + + // Apply rules + conn.AddTable(natTable) + conn.AddChain(postRouteCh) + conn.AddRule(snatRule) + conn.AddChain(preRouteCh) + conn.AddRule(dnatRule) + if err := conn.Flush(); err != nil { + return errors.Wrapf(err, "creating nat rules") + } + return nil +} + +// deleteNatRules deletes the NAT rules to give each uVM a clone address. +func deleteNatRules(vmNsHandle netns.NsHandle) error { + conn := nftables.Conn{NetNS: int(vmNsHandle)} + + natTable := &nftables.Table{ + Name: "nat", + Family: nftables.TableFamilyIPv4, + } + + // Apply + conn.DelTable(natTable) + if err := conn.Flush(); err != nil { + return errors.Wrapf(err, "deleting nat rules") + } + return nil +} + +// setupForwardRules creates forwarding rules to allow traffic from the end of the veth pair to the default host interface. +func setupForwardRules(vethHostName, hostIface string) error { + conn := nftables.Conn{} + + // 1. add table ip filter + filterTable := &nftables.Table{ + Name: "filter", + Family: nftables.TableFamilyIPv4, + } + + // 2. add chain ip filter FORWARD { type filter hook forward priority 0; policy accept; } + polAccept := nftables.ChainPolicyAccept + fwdCh := &nftables.Chain{ + Name: fmt.Sprintf("FORWARD%s", vethHostName), + Table: filterTable, + Type: nftables.ChainTypeFilter, + Priority: 0, + Hooknum: nftables.ChainHookForward, + Policy: &polAccept, + } + + // 3. Iptables: -A FORWARD -i veth1-1 -o eno49 -j ACCEPT + // 3.1 add rule ip filter FORWARD iifname veth1-1 oifname eno49 counter accept + outRule := &nftables.Rule{ + Table: filterTable, + Chain: fwdCh, + Exprs: []expr.Any{ + // Load iffname in register 1 + &expr.Meta{Key: expr.MetaKeyIIFNAME, Register: 1}, + // Check iifname == veth1-0 + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: []byte(fmt.Sprintf("%s\x00", vethHostName)), + }, + // Load oif in register 1 + &expr.Meta{Key: expr.MetaKeyOIFNAME, Register: 1}, + // Check iifname == veth1-0 + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: []byte(fmt.Sprintf("%s\x00", hostIface)), + }, + &expr.Verdict{ + Kind: expr.VerdictAccept, + }, + }, + } + + // 4. Iptables: -A FORWARD -o veth1-1 -i eno49 -j ACCEPT + // 4.1 add rule ip filter FORWARD iifname eno49 oifname veth1-1 counter accept + inRule := &nftables.Rule{ + Table: filterTable, + Chain: fwdCh, + Exprs: []expr.Any{ + // Load oifname in register 1 + &expr.Meta{Key: expr.MetaKeyOIFNAME, Register: 1}, + // Check iifname == veth1-0 + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: []byte(fmt.Sprintf("%s\x00", vethHostName)), + }, + // Load oif in register 1 + &expr.Meta{Key: expr.MetaKeyIIFNAME, Register: 1}, + // Check iifname == veth1-0 + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: []byte(fmt.Sprintf("%s\x00", hostIface)), + }, + &expr.Verdict{ + Kind: expr.VerdictAccept, + }, + }, + } + conn.AddTable(filterTable) + conn.AddChain(fwdCh) + conn.AddRule(outRule) + conn.AddRule(inRule) + + if err := conn.Flush(); err != nil { + return errors.Wrapf(err, "creating forward rules") + } + return nil +} + +// deleteNatRules deletes the forward rules to allow traffic to the default host interface. +func deleteForwardRules(vethHostName string) error { + conn := nftables.Conn{} + + // 1. add table ip filter + filterTable := &nftables.Table{ + Name: "filter", + Family: nftables.TableFamilyIPv4, + } + + // 2. add chain ip filter FORWARD { type filter hook forward priority 0; policy accept; } + polAccept := nftables.ChainPolicyAccept + fwdCh := &nftables.Chain{ + Name: fmt.Sprintf("FORWARD%s", vethHostName), + Table: filterTable, + Type: nftables.ChainTypeFilter, + Priority: 0, + Hooknum: nftables.ChainHookForward, + Policy: &polAccept, + } + + // Apply + conn.FlushChain(fwdCh) + conn.DelChain(fwdCh) + if err := conn.Flush(); err != nil { + return errors.Wrapf(err, "deleting forward rules") + } + return nil +} + +// addRoute adds a routing table entry to destIp with gateway gatewayIp. +func addRoute(destIp, gatewayIp string) error { + _, dstNet, err := net.ParseCIDR(fmt.Sprintf("%s/32", destIp)) + if err != nil { + return errors.Wrapf(err, "parsing route destination ip") + } + + gwAddr, _, err := net.ParseCIDR(gatewayIp) + if err != nil { + return errors.Wrapf(err, "parsing route gateway ip") + } + + route := &netlink.Route{ + Dst: dstNet, + Gw: gwAddr, + } + + if err := netlink.RouteAdd(route); err != nil { + return errors.Wrapf(err, "adding route") + } + return nil +} + +// addRoute deletes the routing table entry to destIp with gateway gatewayIp. +func deleteRoute(destIp, gatewayIp string) error { + _, dstNet, err := net.ParseCIDR(fmt.Sprintf("%s/32", destIp)) + if err != nil { + return errors.Wrapf(err, "parsing route destination ip") + } + + gwAddr, _, err := net.ParseCIDR(gatewayIp) + if err != nil { + return errors.Wrapf(err, "parsing route gateway ip") + } + + route := &netlink.Route{ + Dst: dstNet, + Gw: gwAddr, + } + + if err := netlink.RouteDel(route); err != nil { + return errors.Wrapf(err, "deleting route") + } + return nil +} + +// getNetworkStartID fetches the +func getNetworkStartID() (int, error) { + entries, err := os.ReadDir("/run/netns") + if err != nil { + return 0, errors.Wrapf(err, "Couldn't read network namespace dir") + } + + maxId := 0 + for _, entry := range entries { + if !entry.IsDir() { + netnsName := entry.Name() + + re := regexp.MustCompile(`^uvmns([0-9]+)$`) + regres := re.FindStringSubmatch(netnsName) + + if len(regres) > 1 { + id, err := strconv.Atoi(regres[1]) + if err == nil && id > maxId { + maxId = id + } + } + } + } + + return maxId + 1, nil +} diff --git a/networking/networking_test.go b/networking/networking_test.go new file mode 100644 index 000000000..9725136a1 --- /dev/null +++ b/networking/networking_test.go @@ -0,0 +1,110 @@ +// MIT License +// +// Copyright (c) 2023 Georgiy Lebedev, Plamen Petrov, Amory Hoste and vHive team +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +package networking + +import ( + "fmt" + "os" + "sync" + "testing" + + ctrdlog "github.com/containerd/containerd/log" + log "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" +) + +func TestMain(m *testing.M) { + // call flag.Parse() here if TestMain uses flags + + log.SetFormatter(&log.TextFormatter{ + TimestampFormat: ctrdlog.RFC3339NanoFixed, + FullTimestamp: true, + }) + //log.SetReportCaller(true) // FIXME: make sure it's false unless debugging + + log.SetOutput(os.Stdout) + + log.SetLevel(log.InfoLevel) + + os.Exit(m.Run()) +} + +func TestCreateCleanManager(t *testing.T) { + poolSize := []int{1, 5, 20} + + for _, n := range poolSize { + mgr, createErr := NewNetworkManager("", n) + require.NoError(t, createErr, "Network manager creation returned error") + + cleanErr := mgr.Cleanup() + require.NoError(t, cleanErr, "Network manager cleanup returned error") + } +} + +func TestCreateRemoveNetworkParallel(t *testing.T) { + netNum := []int{50, 200} + + mgr, err := NewNetworkManager("", 10) + require.NoError(t, err, "Network manager creation returned error") + defer func() { _ = mgr.Cleanup() }() + + for _, n := range netNum { + var wg sync.WaitGroup + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + _, err := mgr.CreateNetwork(fmt.Sprintf("func_%d", i)) + require.NoError(t, err, fmt.Sprintf("Failed to create network for func_%d", i)) + }(i) + } + wg.Wait() + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + err := mgr.RemoveNetwork(fmt.Sprintf("func_%d", i)) + require.NoError(t, err, fmt.Sprintf("Failed to remove network for func_%d", i)) + }(i) + } + wg.Wait() + } +} + +func TestCreateRemoveNetworkSerial(t *testing.T) { + netNum := 50 + + mgr, err := NewNetworkManager("", 50) + require.NoError(t, err, "Network manager creation returned error") + defer func() { _ = mgr.Cleanup() }() + + for i := 0; i < netNum; i++ { + _, err = mgr.CreateNetwork(fmt.Sprintf("func_%d", i)) + require.NoError(t, err, "Failed to create network") + } + + for i := 0; i < netNum; i++ { + err = mgr.RemoveNetwork(fmt.Sprintf("func_%d", i)) + require.NoError(t, err, "Failed to remove network") + } +}