Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion pkg/composer/serviceparser/serviceparser.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package serviceparser
import (
"bytes"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"os"
Expand All @@ -34,6 +35,7 @@ import (

"github.com/containerd/nerdctl/v2/pkg/healthcheck"
"github.com/containerd/nerdctl/v2/pkg/identifiers"
"github.com/containerd/nerdctl/v2/pkg/labels"
"github.com/containerd/nerdctl/v2/pkg/reflectutil"
)

Expand Down Expand Up @@ -612,20 +614,36 @@ func newContainer(project *types.Project, parsed *Service, i int) (*Container, e
return nil, err
}
netTypeContainer := false
// Collect per-network static IPs to determine if we need a per-network IP map.
networkIPMap := make(map[string]string)
for _, net := range networks {
if strings.HasPrefix(net.fullName, "container:") {
netTypeContainer = true
}
c.RunArgs = append(c.RunArgs, "--net="+net.fullName)
if value, ok := svc.Networks[net.shortNetworkName]; ok {
if value != nil && value.Ipv4Address != "" {
c.RunArgs = append(c.RunArgs, "--ip="+value.Ipv4Address)
networkIPMap[net.fullName] = value.Ipv4Address
}
if value != nil && value.MacAddress != "" {
c.RunArgs = append(c.RunArgs, "--mac-address="+value.MacAddress)
}
}
}
// When multiple networks have static IPs, pass a per-network IP map as an annotation
// so that each CNI plugin receives only the IP for its own network.
// For a single IP, use the legacy --ip= flag for backward compatibility.
if len(networkIPMap) > 1 {
ipMapJSON, err := json.Marshal(networkIPMap)
if err != nil {
return nil, fmt.Errorf("failed to marshal per-network IP map: %w", err)
}
c.RunArgs = append(c.RunArgs, fmt.Sprintf("--annotation=%s=%s", labels.IPAddressPerNetwork, string(ipMapJSON)))
} else if len(networkIPMap) == 1 {
for _, ip := range networkIPMap {
c.RunArgs = append(c.RunArgs, "--ip="+ip)
}
}

if netTypeContainer && svc.Hostname != "" {
return nil, fmt.Errorf("conflicting options: hostname and container network mode")
Expand Down
4 changes: 4 additions & 0 deletions pkg/labels/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ const (
// IP6Address is the static IP6 address of the container assigned by the user
IP6Address = Prefix + "ip6"

// IPAddressPerNetwork is a JSON-encoded map of network names to user-assigned static
// IPv4 addresses. It is used for multi-network containers.
IPAddressPerNetwork = Prefix + "ip-per-network"

// LogURI is the log URI
LogURI = Prefix + "log-uri"

Expand Down
241 changes: 199 additions & 42 deletions pkg/ocihook/ocihook.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"strings"
"time"

cnilibrary "github.com/containernetworking/cni/libcni"
types100 "github.com/containernetworking/cni/pkg/types/100"
"github.com/opencontainers/runtime-spec/specs-go"
b4nndclient "github.com/rootless-containers/bypass4netns/pkg/api/daemon/client"
Expand Down Expand Up @@ -185,13 +186,15 @@ func newHandlerOpts(state *specs.State, dataStore, cniPath, cniNetconfPath, brid
cniOpts := []cni.Opt{
cni.WithPluginDir([]string{cniPath}),
}
o.cniPluginDir = cniPath
var netw *netutil.NetworkConfig
for _, netstr := range networks {
if netw, err = e.NetworkByNameOrID(netstr); err != nil {
return nil, err
}
cniOpts = append(cniOpts, cni.WithConfListBytes(netw.Bytes))
o.cniNames = append(o.cniNames, netstr)
o.cniNetConfigs = append(o.cniNetConfigs, netw.Bytes)
}
o.cni, err = cni.New(cniOpts...)
if err != nil {
Expand Down Expand Up @@ -228,6 +231,15 @@ func newHandlerOpts(state *specs.State, dataStore, cniPath, cniNetconfPath, brid
o.containerIP6 = ip6Address
}

// Parse per-network IP map if present (for multi-network containers with per-network static IPs)
if ipPerNetJSON, ok := o.state.Annotations[labels.IPAddressPerNetwork]; ok && ipPerNetJSON != "" {
var ipPerNetwork map[string]string
if err := json.Unmarshal([]byte(ipPerNetJSON), &ipPerNetwork); err != nil {
return nil, fmt.Errorf("failed to unmarshal per-network IP map: %w", err)
}
o.ipPerNetwork = ipPerNetwork
}

if rootlessutil.IsRootlessChild() {
o.rootlessKitClient, err = rootlessutil.NewRootlessKitClient()
if err != nil {
Expand Down Expand Up @@ -258,13 +270,16 @@ type handlerOpts struct {
ports []cni.PortMapping
cni cni.CNI
cniNames []string
cniPluginDir string
cniNetConfigs [][]byte
fullID string
rootlessKitClient rlkclient.Client
bypassClient b4nndclient.Client
extraHosts map[string]string // host:ip
containerIP string
containerMAC string
containerIP6 string
ipPerNetwork map[string]string
}

// hookSpec is from https://github.com/containerd/containerd/blob/v1.4.3/cmd/containerd/command/oci-hook.go#L59-L64
Expand Down Expand Up @@ -476,6 +491,72 @@ func CleanupPortReserverProcess(namespace, id string) error {
return nil
}

// perNetworkIfName maps a network's position in the container's network list
// to the interface name used inside the container: index 0 becomes "eth0",
// index 1 becomes "eth1", and so on.
func perNetworkIfName(index int) string {
	const ifPrefix = "eth"
	// fmt.Sprint inserts no space here because one of the operands is a string.
	return fmt.Sprint(ifPrefix, index)
}

// perNetworkAdd attaches the container to a single network by calling
// cnilibrary.AddNetworkList directly, using the interface name (ethN) derived
// from networkIndex and the per-network args supplied by the caller.
//
// networkIndex selects the conflist in opts.cniNetConfigs; an out-of-range index
// or a conflist that fails to parse yields an error. extraArgs is passed through
// as the RuntimeConf Args, and portMappings (when non-empty) is passed as the
// "portMappings" capability. The CNI result is converted with
// types100.NewResultFromResult before being returned.
func perNetworkAdd(ctx context.Context, opts *handlerOpts, networkIndex int, nsPath string, extraArgs [][2]string, portMappings []cni.PortMapping) (*types100.Result, error) {
// Reject indexes that do not correspond to a stored network config.
if networkIndex < 0 || networkIndex >= len(opts.cniNetConfigs) {
return nil, fmt.Errorf("network index %d out of range (have %d networks)", networkIndex, len(opts.cniNetConfigs))
}
confList, err := cnilibrary.ConfListFromBytes(opts.cniNetConfigs[networkIndex])
if err != nil {
return nil, fmt.Errorf("failed to parse conflist for network %d: %w", networkIndex, err)
}
// The CNI config is built with opts.cniPluginDir as its only path entry.
cniConfig := cnilibrary.NewCNIConfig([]string{opts.cniPluginDir}, nil)
rt := &cnilibrary.RuntimeConf{
ContainerID: opts.fullID,
NetNS: nsPath,
IfName: perNetworkIfName(networkIndex),
Args: extraArgs,
CapabilityArgs: make(map[string]interface{}),
}
// Port mappings are only set as a capability when there is at least one.
if len(portMappings) > 0 {
rt.CapabilityArgs["portMappings"] = portMappings
}
// Add ips capability for static IPv6 address allocation per CNI spec:
// https://www.cni.dev/docs/conventions/#well-known-capabilities
// Only the first "IP" arg whose value contains a colon is used; values
// without a colon do not set the "ips" capability here.
for _, arg := range extraArgs {
if arg[0] == "IP" && strings.Contains(arg[1], ":") {
rt.CapabilityArgs["ips"] = []string{arg[1]}
break
}
}
result, err := cniConfig.AddNetworkList(ctx, confList, rt)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the new method, since AddNetworkList is called instead of Setup, the ips capability (for IPv6 static addresses) is not added to rt.CapabilityArgs, so it seems necessary to add them.

ips | Dynamically allocate IPs for container interface. Runtime which has the ability of address allocation can pass these to plugins. | ips | A list of IP (string entries). [ “10.10.0.1/24”, “3ffe:ffff:0:01ff::1/64” ] | none | CNI static plugin

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback. Addressed this in commit 7255bf1

if err != nil {
return nil, err
}
// Convert the generic CNI result into the types100 result type.
return types100.NewResultFromResult(result)
}

// perNetworkDel tears down a single network attachment by invoking
// cnilibrary.DelNetworkList directly, using the interface name (ethN)
// derived from networkIndex.
//
// It returns an error when networkIndex does not address an entry in
// opts.cniNetConfigs, when that entry cannot be parsed as a conflist, or when
// DelNetworkList itself fails.
func perNetworkDel(ctx context.Context, opts *handlerOpts, networkIndex int, nsPath string, extraArgs [][2]string, portMappings []cni.PortMapping) error {
	total := len(opts.cniNetConfigs)
	if !(networkIndex >= 0 && networkIndex < total) {
		return fmt.Errorf("network index %d out of range (have %d networks)", networkIndex, total)
	}

	netConfList, parseErr := cnilibrary.ConfListFromBytes(opts.cniNetConfigs[networkIndex])
	if parseErr != nil {
		return fmt.Errorf("failed to parse conflist for network %d: %w", networkIndex, parseErr)
	}

	// Capabilities start empty; port mappings are only recorded when present.
	capabilities := map[string]interface{}{}
	if len(portMappings) != 0 {
		capabilities["portMappings"] = portMappings
	}

	runtimeConf := cnilibrary.RuntimeConf{
		ContainerID:    opts.fullID,
		NetNS:          nsPath,
		IfName:         perNetworkIfName(networkIndex),
		Args:           extraArgs,
		CapabilityArgs: capabilities,
	}

	pluginDirs := []string{opts.cniPluginDir}
	return cnilibrary.NewCNIConfig(pluginDirs, nil).DelNetworkList(ctx, netConfList, &runtimeConf)
}

func applyNetworkSettings(opts *handlerOpts) (err error) {
portMapOpts, err := getPortMapOpts(opts)
if err != nil {
Expand Down Expand Up @@ -547,17 +628,7 @@ func applyNetworkSettings(opts *handlerOpts) (err error) {
if err != nil {
return err
}
var namespaceOpts []cni.NamespaceOpts
namespaceOpts = append(namespaceOpts, portMapOpts...)
namespaceOpts = append(namespaceOpts, ipAddressOpts...)
namespaceOpts = append(namespaceOpts, macAddressOpts...)
namespaceOpts = append(namespaceOpts, ip6AddressOpts...)
namespaceOpts = append(namespaceOpts,
cni.WithLabels(map[string]string{
"IgnoreUnknown": "1",
}),
cni.WithArgs("NERDCTL_CNI_DHCP_HOSTNAME", opts.state.Annotations[labels.Hostname]),
)

hsMeta := hostsstore.Meta{
ID: opts.state.ID,
Networks: make(map[string]*types100.Result, len(opts.cniNames)),
Expand All @@ -567,33 +638,98 @@ func applyNetworkSettings(opts *handlerOpts) (err error) {
Name: opts.state.Annotations[labels.Name],
}

// When containerd gets bounced, containers that were previously running and that are restarted will go again
// through onCreateRuntime (*unlike* in a normal stop/start flow).
// As such, a container may very well have an ip already. The bridge plugin would thus refuse to loan a new one
// and error out, thus making the onCreateRuntime hook fail. In turn, runc (or containerd) will mis-interpret this,
// and subsequently call onPostStop (although the container will not get deleted), and we will release the name...
// leading to a bricked system where multiple containers may share the same name.
// Thus, we do pre-emptively clean things up - error is not checked, as in the majority of cases, that would
// legitimately error (and that does not matter)
// See https://github.com/containerd/nerdctl/issues/3355
_ = opts.cni.Remove(ctx, opts.fullID, "", namespaceOpts...)
// When per-network IPs are specified (multi-network with different static IPs),
// we must set up each network individually so each CNI plugin receives only its own IP.
// We use cnilibrary directly (instead of go-cni's Setup) so that each network
// gets the correct interface name (eth0, eth1, eth2, ...) rather than all getting eth0.
if len(opts.ipPerNetwork) > 0 {
// Pre-emptively clean up (see comment below for rationale)
for i := range opts.cniNames {
_ = perNetworkDel(ctx, opts, i, "", nil, nil)
}

// Defer CNI configuration removal to ensure idempotency of oci-hook.
defer func() {
if err != nil {
log.L.Warn("Container failed starting. Removing allocated network configuration.")
_ = opts.cni.Remove(ctx, opts.fullID, nsPath, namespaceOpts...)
defer func() {
if err != nil {
log.L.Warn("Container failed starting. Removing allocated network configuration.")
for i, cniName := range opts.cniNames {
if delErr := perNetworkDel(ctx, opts, i, nsPath, nil, nil); delErr != nil {
log.L.WithError(delErr).Warnf("failed to remove network %s during cleanup", cniName)
}
}
}
}()

// Convert port mappings for cnilibrary RuntimeConf capability args
var capPortMappings []cni.PortMapping
if len(opts.ports) > 0 {
capPortMappings = opts.ports
}
}()

cniRes, err := opts.cni.Setup(ctx, opts.fullID, nsPath, namespaceOpts...)
if err != nil {
return fmt.Errorf("failed to call cni.Setup: %w", err)
}
for i, cniName := range opts.cniNames {
// Build per-network CNI_ARGS
extraArgs := [][2]string{
{"IgnoreUnknown", "1"},
{"NERDCTL_CNI_DHCP_HOSTNAME", opts.state.Annotations[labels.Hostname]},
}
if ip, ok := opts.ipPerNetwork[cniName]; ok && ip != "" {
extraArgs = append(extraArgs, [2]string{"IP", ip})
}
if opts.containerMAC != "" {
extraArgs = append(extraArgs, [2]string{"MAC", opts.containerMAC})
}

cniRes, setupErr := perNetworkAdd(ctx, opts, i, nsPath, extraArgs, capPortMappings)
if setupErr != nil {
return fmt.Errorf("failed to call cni.Setup for network %s: %w", cniName, setupErr)
}
if cniRes != nil {
hsMeta.Networks[cniName] = cniRes
}
}
} else {
// Legacy path: single IP (or no IP) shared across all networks
commonOpts := []cni.NamespaceOpts{}
commonOpts = append(commonOpts, portMapOpts...)
commonOpts = append(commonOpts, macAddressOpts...)
commonOpts = append(commonOpts, ip6AddressOpts...)
commonOpts = append(commonOpts,
cni.WithLabels(map[string]string{
"IgnoreUnknown": "1",
}),
cni.WithArgs("NERDCTL_CNI_DHCP_HOSTNAME", opts.state.Annotations[labels.Hostname]),
)
var namespaceOpts []cni.NamespaceOpts
namespaceOpts = append(namespaceOpts, commonOpts...)
namespaceOpts = append(namespaceOpts, ipAddressOpts...)

// When containerd gets bounced, containers that were previously running and that are restarted will go again
// through onCreateRuntime (*unlike* in a normal stop/start flow).
// As such, a container may very well have an ip already. The bridge plugin would thus refuse to loan a new one
// and error out, thus making the onCreateRuntime hook fail. In turn, runc (or containerd) will mis-interpret this,
// and subsequently call onPostStop (although the container will not get deleted), and we will release the name...
// leading to a bricked system where multiple containers may share the same name.
// Thus, we do pre-emptively clean things up - error is not checked, as in the majority of cases, that would
// legitimately error (and that does not matter)
// See https://github.com/containerd/nerdctl/issues/3355
_ = opts.cni.Remove(ctx, opts.fullID, "", namespaceOpts...)

// Defer CNI configuration removal to ensure idempotency of oci-hook.
defer func() {
if err != nil {
log.L.Warn("Container failed starting. Removing allocated network configuration.")
_ = opts.cni.Remove(ctx, opts.fullID, nsPath, namespaceOpts...)
}
}()

cniRes, err := opts.cni.Setup(ctx, opts.fullID, nsPath, namespaceOpts...)
if err != nil {
return fmt.Errorf("failed to call cni.Setup: %w", err)
}

cniResRaw := cniRes.Raw()
for i, cniName := range opts.cniNames {
hsMeta.Networks[cniName] = cniResRaw[i]
cniResRaw := cniRes.Raw()
for i, cniName := range opts.cniNames {
hsMeta.Networks[cniName] = cniResRaw[i]
}
}

b4nnEnabled, b4nnBindEnabled, err := bypass4netnsutil.IsBypass4netnsEnabled(opts.state.Annotations)
Expand Down Expand Up @@ -725,14 +861,35 @@ func onPostStop(opts *handlerOpts) error {
if err != nil {
return err
}
var namespaceOpts []cni.NamespaceOpts
namespaceOpts = append(namespaceOpts, portMapOpts...)
namespaceOpts = append(namespaceOpts, ipAddressOpts...)
namespaceOpts = append(namespaceOpts, macAddressOpts...)
namespaceOpts = append(namespaceOpts, ip6AddressOpts...)
if err := opts.cni.Remove(ctx, opts.fullID, "", namespaceOpts...); err != nil {
log.L.WithError(err).Errorf("failed to call cni.Remove")
return err

if len(opts.ipPerNetwork) > 0 {
// Per-network cleanup: remove each network individually with its own IP
// and the correct interface name (ethN).
var capPortMappings []cni.PortMapping
if len(opts.ports) > 0 {
capPortMappings = opts.ports
}
for i, cniName := range opts.cniNames {
extraArgs := [][2]string{
{"IgnoreUnknown", "1"},
}
if ip, ok := opts.ipPerNetwork[cniName]; ok && ip != "" {
extraArgs = append(extraArgs, [2]string{"IP", ip})
}
if delErr := perNetworkDel(ctx, opts, i, "", extraArgs, capPortMappings); delErr != nil {
log.L.WithError(delErr).Errorf("failed to call cni.Remove for network %s", cniName)
}
}
} else {
var namespaceOpts []cni.NamespaceOpts
namespaceOpts = append(namespaceOpts, portMapOpts...)
namespaceOpts = append(namespaceOpts, ipAddressOpts...)
namespaceOpts = append(namespaceOpts, macAddressOpts...)
namespaceOpts = append(namespaceOpts, ip6AddressOpts...)
if err := opts.cni.Remove(ctx, opts.fullID, "", namespaceOpts...); err != nil {
log.L.WithError(err).Errorf("failed to call cni.Remove")
return err
}
}

// opts.cni.Remove has trouble removing network configurations when netns is empty.
Expand Down
Loading