feat: Add K3s installer with complete lifecycle management

Implemented a production-ready K3s Kubernetes installer with full lifecycle
support including installation, startup management, and cleanup.

Key features:
- Install first master (cluster init), join additional masters (HA), and workers
- Systemd service management via StartupManager abstraction
- IPv6 support with Mycelium interface auto-detection
- Robust destroy/cleanup with proper ordering to prevent hanging
- Complete removal of services, processes, network interfaces, and data

Author: peternashaat
Date: 2025-11-27 14:01:53 +01:00
Parent: dc2f8c2976
Commit: b9b8e7ab75
5 changed files with 230 additions and 288 deletions


@@ -1,217 +0,0 @@
need to install following
#!/bin/bash
set -euo pipefail
EXTRA_ARGS=""
log_info() {
echo '[INFO] ' "$@"
}
log_fatal() {
echo '[ERROR] ' "$@" >&2
exit 1
}
source_env_file() {
local env_file="${1:-}"
if [ ! -f "$env_file" ]; then
log_fatal "Environment file not found: $env_file"
fi
set -a
source "$env_file"
set +a
}
check_root() {
if [ "$EUID" -ne 0 ]; then
log_fatal "This script must be run as root"
fi
}
install_deps() {
log_info "Updating package lists..."
if ! apt-get update -qq > /dev/null 2>&1; then
log_fatal "Failed to update package lists"
fi
if ! command -v curl &> /dev/null; then
log_info "Installing curl..."
apt-get install -y -qq curl > /dev/null 2>&1 || log_fatal "Failed to install curl"
fi
if ! command -v ip &> /dev/null; then
log_info "Installing iproute2 for ip command..."
apt-get install -y -qq iproute2 > /dev/null 2>&1 || log_fatal "Failed to install iproute2"
fi
if ! command -v k3s &> /dev/null; then
log_info "Installing k3s..."
if ! curl -fsSL -o /usr/local/bin/k3s https://github.com/k3s-io/k3s/releases/download/v1.33.1+k3s1/k3s 2>/dev/null; then
log_fatal "Failed to download k3s"
fi
chmod +x /usr/local/bin/k3s
fi
if ! command -v kubectl &> /dev/null; then
log_info "Installing kubectl..."
if ! curl -fsSL -o /usr/local/bin/kubectl https://dl.k8s.io/release/v1.33.1/bin/linux/amd64/kubectl 2>/dev/null; then
log_fatal "Failed to download kubectl"
fi
chmod +x /usr/local/bin/kubectl
fi
}
get_iface_ipv6() {
local iface="$1"
# Step 1: Find the next-hop for 400::/7
local route_line
route_line=$(ip -6 route | grep "^400::/7.*dev ${iface}" || true)
if [ -z "$route_line" ]; then
log_fatal "No 400::/7 route found via interface ${iface}"
fi
# Step 2: Extract the next-hop IPv6 address
local nexthop
nexthop=$(echo "$route_line" | awk '{for(i=1;i<=NF;i++) if ($i=="via") print $(i+1)}')
local prefix
prefix=$(echo "$nexthop" | cut -d':' -f1-4)
# Step 3: Get global IPv6 addresses and match subnet
local ipv6_list
ipv6_list=$(ip -6 addr show dev "$iface" scope global | awk '/inet6/ {print $2}' | cut -d'/' -f1)
local ip ip_prefix
for ip in $ipv6_list; do
ip_prefix=$(echo "$ip" | cut -d':' -f1-4)
if [ "$ip_prefix" = "$prefix" ]; then
echo "$ip"
return 0
fi
done
log_fatal "No global IPv6 address found on ${iface} matching prefix ${prefix}"
}
prepare_args() {
log_info "Preparing k3s arguments..."
if [ -z "${K3S_FLANNEL_IFACE:-}" ]; then
log_fatal "K3S_FLANNEL_IFACE not set, it should be your mycelium interface"
else
local ipv6
ipv6=$(get_iface_ipv6 "$K3S_FLANNEL_IFACE")
EXTRA_ARGS="$EXTRA_ARGS --node-ip=$ipv6"
fi
if [ -n "${K3S_DATA_DIR:-}" ]; then
log_info "k3s data-dir set to: $K3S_DATA_DIR"
if [ -d "/var/lib/rancher/k3s" ] && [ -n "$(ls -A /var/lib/rancher/k3s 2>/dev/null)" ]; then
cp -r /var/lib/rancher/k3s/* "$K3S_DATA_DIR" && rm -rf /var/lib/rancher/k3s
fi
EXTRA_ARGS="$EXTRA_ARGS --data-dir $K3S_DATA_DIR --kubelet-arg=root-dir=$K3S_DATA_DIR/kubelet"
fi
if [[ "${MASTER:-}" = "true" ]]; then
EXTRA_ARGS="$EXTRA_ARGS --cluster-cidr=2001:cafe:42::/56"
EXTRA_ARGS="$EXTRA_ARGS --service-cidr=2001:cafe:43::/112"
EXTRA_ARGS="$EXTRA_ARGS --flannel-ipv6-masq"
fi
if [ -z "${K3S_URL:-}" ]; then
# Add additional SANs for planetary network IP, public IPv4, and public IPv6
# https://github.com/threefoldtech/tf-images/issues/98
local ifaces=( "tun0" "eth1" "eth2" )
for iface in "${ifaces[@]}"
do
# Check if interface exists before querying
if ! ip addr show "$iface" &>/dev/null; then
continue
fi
local addrs
addrs=$(ip addr show "$iface" 2>/dev/null | grep -E "inet |inet6 " | grep "global" | cut -d '/' -f1 | awk '{print $2}' || true)
local addr
for addr in $addrs
do
# Validate the IP address by trying to route to it
if ip route get "$addr" &>/dev/null; then
EXTRA_ARGS="$EXTRA_ARGS --tls-san $addr"
fi
done
done
if [ "${HA:-}" = "true" ]; then
EXTRA_ARGS="$EXTRA_ARGS --cluster-init"
fi
else
if [ -z "${K3S_TOKEN:-}" ]; then
log_fatal "K3S_TOKEN must be set when K3S_URL is specified (joining a cluster)"
fi
fi
}
patch_manifests() {
log_info "Patching manifests..."
dir="${K3S_DATA_DIR:-/var/lib/rancher/k3s}"
manifest="$dir/server/manifests/tfgw-crd.yaml"
# If K3S_URL found, remove manifest and exit. it is an agent node
if [[ -n "${K3S_URL:-}" ]]; then
rm -f "$manifest"
log_info "Agent node detected, removed manifest: $manifest"
exit 0
fi
# If K3S_URL not found, patch the manifest. it is a server node
[[ ! -f "$manifest" ]] && echo "Manifest not found: $manifest" >&2 && exit 1
sed -i \
-e "s|\${MNEMONIC}|${MNEMONIC:-}|g" \
-e "s|\${NETWORK}|${NETWORK:-}|g" \
-e "s|\${TOKEN}|${TOKEN:-}|g" \
"$manifest"
}
run_node() {
if [ -z "${K3S_URL:-}" ]; then
log_info "Starting k3s server (initializing new cluster)..."
log_info "Command: k3s server --flannel-iface $K3S_FLANNEL_IFACE $EXTRA_ARGS"
exec k3s server --flannel-iface "$K3S_FLANNEL_IFACE" $EXTRA_ARGS 2>&1
elif [ "${MASTER:-}" = "true" ]; then
log_info "Starting k3s server (joining existing cluster as master)..."
log_info "Command: k3s server --server $K3S_URL --flannel-iface $K3S_FLANNEL_IFACE $EXTRA_ARGS"
exec k3s server --server "$K3S_URL" --flannel-iface "$K3S_FLANNEL_IFACE" $EXTRA_ARGS 2>&1
else
log_info "Starting k3s agent (joining existing cluster as worker)..."
log_info "Command: k3s agent --server $K3S_URL --flannel-iface $K3S_FLANNEL_IFACE $EXTRA_ARGS"
exec k3s agent --server "$K3S_URL" --flannel-iface "$K3S_FLANNEL_IFACE" $EXTRA_ARGS 2>&1
fi
}
main() {
source_env_file "${1:-}"
check_root
install_deps
prepare_args
patch_manifests
run_node
}
main "$@"
INSTRUCTIONS: USE HEROLIB AS MUCH AS POSSIBLE e.g. SAL


@@ -70,13 +70,10 @@ fn running() !bool {
// Check if k3s process is running
res := osal.exec(cmd: 'pgrep -f "k3s (server|agent)"', stdout: false, raise_error: false)!
if res.exit_code == 0 {
// Also check if kubectl can connect
kubectl_res := osal.exec(
cmd: 'kubectl get nodes'
stdout: false
raise_error: false
)!
return kubectl_res.exit_code == 0
// K3s process is running, that's enough for basic check
// We don't check kubectl connectivity here as it might not be ready immediately
// and could hang if kubeconfig is not properly configured
return true
}
return false
}
@@ -332,33 +329,91 @@ pub fn (self &KubernetesInstaller) generate_join_script() !string {
fn destroy() ! {
console.print_header('Destroying K3s installation')
// Stop K3s if running
osal.process_kill_recursive(name: 'k3s')!
// Get configuration to find data directory, or use default
data_dir := if cfg := get() {
cfg.data_dir
// Get configuration to find data directory
// Try to get from current configuration, otherwise use common paths
mut data_dirs := []string{}
if cfg := get() {
data_dirs << cfg.data_dir
console.print_debug('Found configured data directory: ${cfg.data_dir}')
} else {
console.print_debug('No configuration found, using default paths')
'/var/lib/rancher/k3s'
console.print_debug('No configuration found, will clean up common K3s paths')
}
// Always add common K3s directories to ensure complete cleanup
data_dirs << '/var/lib/rancher/k3s'
data_dirs << '/root/hero/var/k3s'
// Clean up network interfaces
cleanup_network()!
// CRITICAL: Complete systemd service deletion FIRST before any other cleanup
// This prevents the service from auto-restarting during cleanup
// Step 1: Stop and delete ALL k3s systemd services using startupmanager
console.print_header('Stopping and removing systemd services...')
// Get systemd startup manager
mut sm := startupmanager_get(.systemd) or {
console.print_debug('Failed to get systemd manager: ${err}')
return error('Could not get systemd manager: ${err}')
}
// List all k3s services
all_services := sm.list() or {
console.print_debug('Failed to list services: ${err}')
[]string{}
}
// Filter and delete k3s services
for service_name in all_services {
if service_name.starts_with('k3s_') {
console.print_debug('Deleting systemd service: ${service_name}')
// Use startupmanager.delete() which properly stops, disables, and removes the service
sm.delete(service_name) or {
console.print_debug('Failed to delete service ${service_name}: ${err}')
}
}
}
console.print_header('Systemd services removed')
// Unmount kubelet mounts
// Step 2: Kill any remaining K3s processes
console.print_header('Killing any remaining K3s processes...')
osal.exec(cmd: 'killall -9 k3s 2>/dev/null || true', stdout: false, raise_error: false) or {
console.print_debug('No k3s processes to kill or killall failed')
}
// Wait for processes to fully terminate
osal.exec(cmd: 'sleep 2', stdout: false) or {}
// Step 3: Unmount kubelet mounts (before network cleanup)
cleanup_mounts()!
// Remove data directory
if data_dir != '' {
console.print_header('Removing data directory: ${data_dir}')
osal.rm(data_dir)!
// Step 4: Clean up network interfaces (after processes are stopped)
cleanup_network()!
// Step 5: Remove data directories
console.print_header('Removing data directories...')
// Remove all K3s data directories (deduplicated)
mut cleaned_dirs := map[string]bool{}
for data_dir in data_dirs {
if data_dir != '' && data_dir !in cleaned_dirs {
cleaned_dirs[data_dir] = true
console.print_debug('Removing data directory: ${data_dir}')
osal.exec(cmd: 'rm -rf ${data_dir}', stdout: false, raise_error: false) or {
console.print_debug('Failed to remove ${data_dir}: ${err}')
}
}
}
// Also remove /etc/rancher which K3s creates
console.print_debug('Removing /etc/rancher')
osal.exec(cmd: 'rm -rf /etc/rancher', stdout: false, raise_error: false) or {}
// Clean up CNI
osal.exec(cmd: 'rm -rf /var/lib/cni/', stdout: false) or {}
// Step 6: Clean up CNI
console.print_header('Cleaning up CNI directories...')
osal.exec(cmd: 'rm -rf /var/lib/cni/', stdout: false, raise_error: false) or {}
// Clean up iptables rules
// Step 7: Clean up iptables rules
console.print_header('Cleaning up iptables rules')
osal.exec(
cmd: 'iptables-save | grep -v KUBE- | grep -v CNI- | grep -iv flannel | iptables-restore'
@@ -378,24 +433,59 @@ fn cleanup_network() ! {
console.print_header('Cleaning up network interfaces')
// Remove interfaces that are slaves of cni0
osal.exec(
cmd: 'ip link show | grep "master cni0" | awk -F: \'{print $2}\' | xargs -r -n1 ip link delete'
// Get the list first, then delete one by one
if veth_result := osal.exec(
cmd: 'ip link show | grep "master cni0" | awk -F: \'{print $2}\' | xargs'
stdout: false
raise_error: false
) or {}
) {
if veth_result.output.trim_space() != '' {
veth_interfaces := veth_result.output.trim_space().split(' ')
for veth in veth_interfaces {
veth_trimmed := veth.trim_space()
if veth_trimmed != '' {
console.print_debug('Deleting veth interface: ${veth_trimmed}')
osal.exec(cmd: 'ip link delete ${veth_trimmed}', stdout: false, raise_error: false) or {
console.print_debug('Failed to delete ${veth_trimmed}, continuing...')
}
}
}
}
} else {
console.print_debug('No veth interfaces found or error getting list')
}
// Remove CNI-related interfaces
interfaces := ['cni0', 'flannel.1', 'flannel-v6.1', 'kube-ipvs0', 'flannel-wg', 'flannel-wg-v6']
for iface in interfaces {
osal.exec(cmd: 'ip link delete ${iface}', stdout: false, raise_error: false) or {}
console.print_debug('Deleting interface: ${iface}')
// Use timeout to prevent hanging, and redirect stderr to avoid blocking
osal.exec(cmd: 'timeout 5 ip link delete ${iface} 2>/dev/null || true', stdout: false, raise_error: false) or {
console.print_debug('Interface ${iface} not found or already deleted')
}
}
// Remove CNI namespaces
osal.exec(
cmd: 'ip netns show | grep cni- | xargs -r -n1 ip netns delete'
if ns_result := osal.exec(
cmd: 'ip netns show | grep cni- | xargs'
stdout: false
raise_error: false
) or {}
) {
if ns_result.output.trim_space() != '' {
namespaces := ns_result.output.trim_space().split(' ')
for ns in namespaces {
ns_trimmed := ns.trim_space()
if ns_trimmed != '' {
console.print_debug('Deleting namespace: ${ns_trimmed}')
osal.exec(cmd: 'ip netns delete ${ns_trimmed}', stdout: false, raise_error: false) or {
console.print_debug('Failed to delete namespace ${ns_trimmed}')
}
}
}
}
} else {
console.print_debug('No CNI namespaces found')
}
}
fn cleanup_mounts() ! {
@@ -406,13 +496,29 @@ fn cleanup_mounts() ! {
for path in paths {
// Find all mounts under this path and unmount them
osal.exec(
cmd: 'mount | grep "${path}" | awk \'{print $3}\' | sort -r | xargs -r -n1 umount -f'
if mount_result := osal.exec(
cmd: 'mount | grep "${path}" | awk \'{print $3}\' | sort -r'
stdout: false
raise_error: false
) or {}
) {
if mount_result.output.trim_space() != '' {
mount_points := mount_result.output.split_into_lines()
for mount_point in mount_points {
mp_trimmed := mount_point.trim_space()
if mp_trimmed != '' {
console.print_debug('Unmounting: ${mp_trimmed}')
osal.exec(cmd: 'umount -f ${mp_trimmed}', stdout: false, raise_error: false) or {
console.print_debug('Failed to unmount ${mp_trimmed}')
}
}
}
}
} else {
console.print_debug('No mounts found for ${path}')
}
// Remove the directory
console.print_debug('Removing directory: ${path}')
osal.exec(cmd: 'rm -rf ${path}', stdout: false, raise_error: false) or {}
}
}


@@ -20,8 +20,8 @@ pub mut:
data_dir string
// Unique node name/identifier
node_name string
// Mycelium interface name (default: mycelium0)
mycelium_interface string = 'mycelium0'
// Mycelium interface name (auto-detected if not specified)
mycelium_interface string
// Cluster token for authentication (auto-generated if empty)
token string
// Master URL for joining cluster (e.g., 'https://[ipv6]:6443')
@@ -54,6 +54,11 @@ fn obj_init(mycfg_ KubernetesInstaller) !KubernetesInstaller {
mycfg.node_name = if hostname != '' { hostname } else { 'k3s-node-${rand.hex(4)}' }
}
// Auto-detect Mycelium interface if not provided
if mycfg.mycelium_interface == '' {
mycfg.mycelium_interface = detect_mycelium_interface()!
}
// Generate token if not provided and this is the first master
if mycfg.token == '' && mycfg.is_first_master {
// Generate a secure random token
@@ -82,6 +87,33 @@ pub fn (self &KubernetesInstaller) get_mycelium_ipv6() !string {
return get_mycelium_ipv6_from_interface(self.mycelium_interface)!
}
// Auto-detect Mycelium interface by finding 400::/7 route
fn detect_mycelium_interface() !string {
// Find all 400::/7 routes
route_result := osal.exec(
cmd: 'ip -6 route | grep "^400::/7"'
stdout: false
raise_error: false
)!
if route_result.exit_code != 0 || route_result.output.trim_space() == '' {
return error('No Mycelium interface found (no 400::/7 route detected). Please ensure Mycelium is installed and running.')
}
// Parse interface name from route (format: "400::/7 dev <interface> ...")
route_line := route_result.output.trim_space()
parts := route_line.split(' ')
for i, part in parts {
if part == 'dev' && i + 1 < parts.len {
iface := parts[i + 1]
return iface
}
}
return error('Could not parse Mycelium interface from route output: ${route_line}')
}
// Helper function to detect Mycelium IPv6 from interface
fn get_mycelium_ipv6_from_interface(iface string) !string {
// Step 1: Find the 400::/7 route via the interface
@@ -95,8 +127,15 @@ fn get_mycelium_ipv6_from_interface(iface string) !string {
return error('No 400::/7 route found via interface ${iface}')
}
// Step 2: Extract next-hop IPv6 and get prefix (first 4 segments)
// Parse: "400::/7 via <nexthop> dev <iface> ..."
// Step 2: Get all global IPv6 addresses on the interface
addr_result := osal.exec(
cmd: 'ip -6 addr show dev ${iface} scope global | grep inet6 | awk \'{print $2}\' | cut -d/ -f1'
stdout: false
)!
ipv6_list := addr_result.output.split_into_lines()
// Check if route has a next-hop (via keyword)
parts := route_line.split(' ')
mut nexthop := ''
for i, part in parts {
@@ -106,42 +145,47 @@ fn get_mycelium_ipv6_from_interface(iface string) !string {
}
}
if nexthop == '' {
return error('Could not extract next-hop from route: ${route_line}')
}
if nexthop != '' {
// Route has a next-hop: match by prefix (first 4 segments)
prefix_parts := nexthop.split(':')
if prefix_parts.len < 4 {
return error('Invalid IPv6 next-hop format: ${nexthop}')
}
prefix := prefix_parts[0..4].join(':')
// Get first 4 segments of IPv6 address (prefix)
prefix_parts := nexthop.split(':')
if prefix_parts.len < 4 {
return error('Invalid IPv6 next-hop format: ${nexthop}')
}
prefix := prefix_parts[0..4].join(':')
// Step 3: Match the one with the same prefix
for ip in ipv6_list {
ip_trimmed := ip.trim_space()
if ip_trimmed == '' {
continue
}
// Step 3: Get all global IPv6 addresses on the interface
addr_result := osal.exec(
cmd: 'ip -6 addr show dev ${iface} scope global | grep inet6 | awk \'{print $2}\' | cut -d/ -f1'
stdout: false
)!
ipv6_list := addr_result.output.split_into_lines()
// Step 4: Match the one with the same prefix
for ip in ipv6_list {
ip_trimmed := ip.trim_space()
if ip_trimmed == '' {
continue
ip_parts := ip_trimmed.split(':')
if ip_parts.len >= 4 {
ip_prefix := ip_parts[0..4].join(':')
if ip_prefix == prefix {
return ip_trimmed
}
}
}
ip_parts := ip_trimmed.split(':')
if ip_parts.len >= 4 {
ip_prefix := ip_parts[0..4].join(':')
if ip_prefix == prefix {
return error('No global IPv6 address found on ${iface} matching prefix ${prefix}')
} else {
// Direct route (no via): return the first IPv6 address in 400::/7 range
for ip in ipv6_list {
ip_trimmed := ip.trim_space()
if ip_trimmed == '' {
continue
}
// Check if IP is in 400::/7 range (starts with 4 or 5)
if ip_trimmed.starts_with('4') || ip_trimmed.starts_with('5') {
return ip_trimmed
}
}
}
return error('No global IPv6 address found on ${iface} matching prefix ${prefix}')
return error('No global IPv6 address found on ${iface} in 400::/7 range')
}
}
// called before start if done


@@ -1,3 +0,0 @@
https://github.com/codescalers/kubecloud/blob/master/k3s/native_guide/k3s_killall.sh
still need to implement this


@@ -74,7 +74,7 @@ kubernetes_installer.play(heroscript: heroscript)!
| `k3s_version` | string | 'v1.33.1' | K3s version to install |
| `data_dir` | string | '~/hero/var/k3s' | Data directory for K3s |
| `node_name` | string | hostname | Unique node identifier |
| `mycelium_interface` | string | 'mycelium0' | Mycelium interface name |
| `mycelium_interface` | string | auto-detected | Mycelium interface name (auto-detected from 400::/7 route) |
| `token` | string | auto-generated | Cluster authentication token |
| `master_url` | string | - | Master URL for joining (e.g., 'https://[ipv6]:6443') |
| `node_ip` | string | auto-detected | Node IPv6 (auto-detected from Mycelium) |
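
For reference, the auto-detection behind `mycelium_interface` and `node_ip` follows the 400::/7-route logic used by the installer module. A minimal shell sketch of that logic (assuming iproute2 is installed and Mycelium is up; the module additionally matches the route's next-hop prefix, which is omitted here):

```bash
#!/bin/bash
# Sketch of the 400::/7-based auto-detection (assumes iproute2; Mycelium must be running).
set -eu

# Interface: the device carrying the 400::/7 route (Mycelium's overlay range).
iface=$(ip -6 route | awk '/^400::\/7/ {for (i = 1; i <= NF; i++) if ($i == "dev") print $(i + 1); exit}')
[ -n "$iface" ] || { echo "No 400::/7 route found; is Mycelium running?" >&2; exit 1; }

# Node IP: first global IPv6 on that interface inside 400::/7 (first hex digit 4 or 5).
node_ip=$(ip -6 addr show dev "$iface" scope global | awk '/inet6/ {print $2}' | cut -d/ -f1 | grep -E '^[45]' | head -n1)
[ -n "$node_ip" ] || { echo "No global IPv6 in 400::/7 on $iface" >&2; exit 1; }

echo "mycelium_interface=$iface node_ip=$node_ip"
```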
@@ -121,17 +121,20 @@ This ensures K3s binds to the correct Mycelium IPv6 even if the server has other
### Cluster Setup
**First Master:**
- Uses `--cluster-init` flag
- Auto-generates secure token
- Configures IPv6 CIDRs: cluster=2001:cafe:42::/56, service=2001:cafe:43::/112
- Generates join script for other nodes
**Additional Masters:**
- Joins with `--server <master_url>`
- Requires token and master_url from first master
- Provides HA for control plane
**Workers:**
- Joins as agent with `--server <master_url>`
- Requires token and master_url from first master
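
For orientation, these roles correspond roughly to the following k3s invocations (an illustrative sketch, not the exact command line the installer builds; `mycelium0`, `NODE_IPV6`, `MASTER_IPV6`, and `TOKEN` are placeholders):

```bash
# First master: initialize a new cluster over the Mycelium interface.
k3s server --cluster-init --flannel-iface mycelium0 --node-ip "$NODE_IPV6" \
  --cluster-cidr 2001:cafe:42::/56 --service-cidr 2001:cafe:43::/112 --flannel-ipv6-masq

# Additional master: join the existing control plane.
K3S_TOKEN="$TOKEN" k3s server --server "https://[$MASTER_IPV6]:6443" \
  --flannel-iface mycelium0 --node-ip "$NODE_IPV6"

# Worker: join as an agent.
K3S_TOKEN="$TOKEN" k3s agent --server "https://[$MASTER_IPV6]:6443" \
  --flannel-iface mycelium0 --node-ip "$NODE_IPV6"
```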
@@ -149,24 +152,28 @@ The `destroy` action performs complete cleanup:
## Example Workflow
1. **Install first master on server1:**
```bash
hero run templates/examples.heroscript
# Note the token and IPv6 address displayed
```
2. **Join additional master on server2:**
```bash
# Edit examples.heroscript Section 2 with token and master_url
hero run templates/examples.heroscript
```
3. **Add worker on server3:**
```bash
# Edit examples.heroscript Section 3 with token and master_url
hero run templates/examples.heroscript
```
4. **Verify cluster:**
```bash
kubectl get nodes
kubectl get pods --all-namespaces
@@ -177,12 +184,14 @@ The `destroy` action performs complete cleanup:
The kubeconfig is located at: `<data_dir>/server/cred/admin.kubeconfig`
To use kubectl:
```bash
export KUBECONFIG=~/hero/var/k3s/server/cred/admin.kubeconfig
kubectl get nodes
```
Or copy to default location:
```bash
mkdir -p ~/.kube
cp ~/hero/var/k3s/server/cred/admin.kubeconfig ~/.kube/config
@@ -191,16 +200,19 @@ cp ~/hero/var/k3s/server/cred/admin.kubeconfig ~/.kube/config
## Troubleshooting
**K3s won't start:**
- Check if Mycelium is running: `ip -6 addr show mycelium0`
- Verify 400::/7 route exists: `ip -6 route | grep 400::/7`
- Check logs: `journalctl -u k3s_* -f`
**Can't join cluster:**
- Verify token matches first master
- Ensure master_url uses correct IPv6 in brackets: `https://[ipv6]:6443`
- Check network connectivity over Mycelium: `ping6 <master_ipv6>`
**Cleanup issues:**
- Run destroy with sudo if needed
- Manually check for remaining processes: `pgrep -f k3s`
- Check for remaining mounts: `mount | grep k3s`
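
When cleanup misbehaves, the quickest way to see what `destroy` left behind is to check each thing it is supposed to remove. A small verification sketch (paths and interface names match the defaults used above):

```bash
#!/bin/bash
# Check for leftovers after destroy: services, processes, interfaces, mounts, data.
systemctl list-units 'k3s_*' --all --no-legend || true
pgrep -af 'k3s (server|agent)' || echo "no k3s processes"
for i in cni0 flannel.1 flannel-v6.1 kube-ipvs0 flannel-wg flannel-wg-v6; do
  ip link show "$i" >/dev/null 2>&1 && echo "interface $i still present"
done
mount | grep -E 'k3s|kubelet' || echo "no leftover mounts"
ls -d /var/lib/rancher/k3s /etc/rancher /var/lib/cni 2>/dev/null || echo "data directories removed"
```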