426 lines
12 KiB
Markdown
426 lines
12 KiB
Markdown
# SDN Setup für Ring Network mit Thunderbolt
|
||
|
||
- Datacenter auswählen
|
||
- SDN > Fabrics
|
||
- Add Fabric > OpenFabric
|
||
- Name: `tbfabric` (max 8 Zeichen)
|
||
- IPv4 Prefix: `10.0.21.0/29`
|
||
- Add Node (für alle Nodes des Clusters)
|
||
- IPv4 10.0.21.1 (statische IP aus Netz des Prefixes von oben)
|
||
- X `thunderbolt0` (keine IP zuweisen)
|
||
- X `thunderbolt1` (keine IP zuweisen)
|
||
- Im Hauptmenü wieder SDN auswählen
|
||
- Apply
|
||
|
||
# Thunderbolt Interface Monitor für Proxmox
|
||
|
||
Automatisches Monitoring und Recovery für Thunderbolt-Interfaces zwischen Proxmox-Nodes.
|
||
|
||
## Features
|
||
|
||
- 🔍 Überwacht `thunderbolt0` und `thunderbolt1`
|
||
- 🔄 Automatischer Neustart bei Interface-Problemen
|
||
- 📝 Detailliertes Logging mit farbigen Status-Icons
|
||
- 🚀 systemd Service Integration
|
||
- 🔧 Konfigurierbare Check-Intervalle
|
||
- 📊 Log-Rotation bei großen Dateien
|
||
|
||
## Manuelle Installation
|
||
|
||
### 1. Thunderbolt Monitor Script erstellen
|
||
|
||
```bash
|
||
cat > /usr/local/bin/thunderbolt-monitor.sh << 'SCRIPT_EOF'
|
||
#!/bin/bash
#
# thunderbolt-monitor.sh - periodically checks the configured Thunderbolt
# network interfaces and restarts any interface that is found down.

# =========================
# Configuration
# =========================
INTERFACES=("thunderbolt0" "thunderbolt1")   # interfaces to watch
CHECK_INTERVAL=30                            # seconds between two check cycles
LOG_FILE="/var/log/thunderbolt-monitor.log"
MAX_LOG_SIZE=10485760                        # rotate the log above this size (bytes)

# Ping targets (this node's own dummy IP, read from SELF_CONF_FILE, is skipped automatically)
PING_TARGETS=("10.0.21.1" "10.0.21.2" "10.0.21.3")
PING_COUNT=1                                 # passed to ping -c
PING_TIMEOUT=1                               # passed to ping -W
SELF_CONF_FILE="/etc/network/interfaces.d/sdn"
SELF_IP=""                                   # filled in by detect_self_ip

# Icons
ICON_INFO="🔵"     # info / interface up, nothing to do
ICON_OK="✅"       # success
ICON_ERR="❌"      # error
ICON_WARN="⚠️"      # warning (e.g. interface does not exist)
ICON_RESTART="🔄"  # restart action
ICON_UP="🟢"       # up
ICON_DOWN="🔴"     # down

# Colour for interface names (the ANSI codes stay in the log and render with `tail -f`)
IF_COLOR_START=$'\033[95m'
IF_COLOR_END=$'\033[0m'

# =========================
# Internal state
# =========================
# Map: per interface, the ping targets that failed last time (retried in the next cycle)
declare -A FAILMAP # key: iface, value: "ip1 ip2 ..."
|
||
|
||
# =========================
|
||
# Logging / Utilities
|
||
# =========================
|
||
# Print the interface name given as $1 wrapped in the configured colour codes.
ci() {
  local painted="${IF_COLOR_START}${1}${IF_COLOR_END}"
  printf '%b' "$painted"
}
|
||
|
||
#######################################
# Print one timestamped log line and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - status icon; all remaining arguments - message text
# Outputs:   "[YYYY-mm-dd HH:MM:SS] <icon> <message>" on stdout and in LOG_FILE
#######################################
log_line() {
  local icon="$1"
  shift
  local msg="$*"
  local ts
  ts=$(date '+%Y-%m-%d %H:%M:%S')
  # tee keeps the line on stdout while appending it to the file.
  printf '[%s] %s %s\n' "$ts" "$icon" "$msg" | tee -a "$LOG_FILE"
}
|
||
|
||
#######################################
# Move the log file to "<LOG_FILE>.old" once it exceeds MAX_LOG_SIZE bytes.
# Globals:   LOG_FILE, MAX_LOG_SIZE, ICON_INFO, ICON_ERR (read)
# Returns:   always 0; a failed move is reported through log_line
#######################################
rotate_log() {
  [[ -f "$LOG_FILE" ]] || return 0

  local size
  # GNU stat first, BSD stat as a fallback, 0 if neither works.
  size=$(stat -c%s "$LOG_FILE" 2>/dev/null || stat -f%z "$LOG_FILE" 2>/dev/null || echo 0)
  # Guard the arithmetic below against unexpected non-numeric stat output.
  [[ "$size" =~ ^[0-9]+$ ]] || size=0

  if (( size > MAX_LOG_SIZE )); then
    # Only announce the rotation when the move really happened.
    if mv -- "$LOG_FILE" "${LOG_FILE}.old"; then
      log_line "$ICON_INFO" "Log rotiert – alte Datei: ${LOG_FILE}.old"
    else
      log_line "$ICON_ERR" "Log-Rotation fehlgeschlagen: ${LOG_FILE}"
    fi
  fi
  return 0
}
|
||
|
||
#######################################
# Determine this node's own dummy IP from the "address <IP>/<CIDR>" line of the
# "iface dummy_tbfabric inet static" stanza in SELF_CONF_FILE.
# Globals:   SELF_CONF_FILE (read), SELF_IP (written; empty when nothing is found)
# Returns:   0
#######################################
detect_self_ip() {
  # Start from a clean value so a stale IP never survives a failed lookup.
  SELF_IP=""
  [[ -r "$SELF_CONF_FILE" ]] || return 0

  local raw_ip
  # Every "iface" line opens a new stanza, so the search is limited to the
  # dummy_tbfabric stanza and cannot pick up another interface's address.
  raw_ip=$(awk '
    /^iface[ \t]+/ {
      in_block = ($2 == "dummy_tbfabric" && $3 == "inet" && $4 == "static")
      next
    }
    in_block && $1 == "address" { print $2; exit }
  ' "$SELF_CONF_FILE")

  # Strip the CIDR suffix, if any.
  SELF_IP="${raw_ip%%/*}"
}
|
||
|
||
#######################################
# Print the first IPv4 address (without prefix length) of an interface.
# Arguments: $1 - interface name
# Outputs:   the address on stdout; nothing if `ip` reports no IPv4 address
#######################################
iface_src_ip() {
  local iface="$1"
  # In `ip -o` output field 4 is "ADDR/PREFIX"; only the first line is used.
  ip -4 -o addr show dev "$iface" 2>/dev/null \
    | awk '{ split($4, parts, "/"); print parts[1]; exit }'
}
|
||
|
||
#######################################
# Ping a target with three strategies, stopping at the first that succeeds:
#   1) plain ping, following the routing table (no -I)
#   2) ping bound to the interface (-I <iface>)
#   3) ping bound to the interface's source IP (-I <source ip>)
# Globals:   PING_COUNT, PING_TIMEOUT (read)
# Arguments: $1 - interface name, $2 - target IP
# Returns:   0 if any attempt succeeded, 1 otherwise
#######################################
ping_host() {
  local iface="$1" ip="$2"
  local -a ping_opts=(-c "$PING_COUNT" -W "$PING_TIMEOUT")

  # 1) via normal routing
  if ping "${ping_opts[@]}" "$ip" &>/dev/null; then
    return 0
  fi

  # 2) bound to the interface
  if ping -I "$iface" "${ping_opts[@]}" "$ip" &>/dev/null; then
    return 0
  fi

  # 3) bound to the interface's source IP (skipped when it has none)
  local src
  src="$(iface_src_ip "$iface")"
  if [[ -n "$src" ]] && ping -I "$src" "${ping_opts[@]}" "$ip" &>/dev/null; then
    return 0
  fi

  return 1
}
|
||
|
||
#######################################
# Report the state of an interface.
# Arguments: $1 - interface name
# Returns:   0 - `ip link` reports "state UP" and sysfs operstate is "up"
#            1 - interface exists but is not up
#            2 - interface does not exist (`ip link show` failed)
#######################################
is_interface_up() {
  local iface="$1"
  local link_info state="" oper
  local state_re='state ([A-Z]+)'

  # One call serves both as existence check and as source of the state, so
  # the interface cannot disappear between the two steps.
  link_info=$(ip link show "$iface" 2>/dev/null) || return 2

  if [[ "$link_info" =~ $state_re ]]; then
    state="${BASH_REMATCH[1]}"
  fi
  oper=$(cat "/sys/class/net/$iface/operstate" 2>/dev/null || echo "unknown")

  if [[ "$state" == "UP" && "$oper" == "up" ]]; then
    return 0
  fi
  return 1
}
|
||
|
||
#######################################
# FAILMAP helper: merge newly failed ping targets into the interface's list.
# Duplicates are dropped and the order of first appearance is kept.
# Globals:   FAILMAP (read and written)
# Arguments: $1 - interface name; remaining arguments - failed target IPs
#######################################
add_failed_targets() {
  local iface="$1"
  shift
  local -A seen=()
  local -a merged=()
  local ip

  # The stored list is a space-separated string, so word splitting of the
  # unquoted expansion is intentional here.
  for ip in ${FAILMAP[$iface]:-} "$@"; do
    [[ -z "$ip" ]] && continue
    [[ -n "${seen[$ip]:-}" ]] && continue
    seen["$ip"]=1
    merged+=("$ip")
  done

  FAILMAP[$iface]="${merged[*]}"
}
|
||
|
||
#######################################
# Ping again every target that is recorded as failed for an interface and
# keep only those that still fail.
# Globals:   FAILMAP (read and written), ICON_INFO, ICON_OK, ICON_ERR (read)
# Arguments: $1 - interface name
# Returns:   0 immediately when nothing is recorded for the interface
#######################################
retry_failed_targets() {
  local iface="$1"
  local pending="${FAILMAP[$iface]:-}"
  [[ -n "$pending" ]] || return 0

  log_line "$ICON_INFO" "Retry fehlgeschlagener Pings: $pending"

  local -a queue=() still_failing=()
  local ip
  # Split the space-separated list without exposing it to glob expansion.
  read -ra queue <<<"$pending"

  for ip in "${queue[@]}"; do
    if ping_host "$iface" "$ip"; then
      log_line "$ICON_OK" "Retry erfolgreich: $ip"
    else
      log_line "$ICON_ERR" "Retry fehlgeschlagen: $ip"
      still_failing+=("$ip")
    fi
  done

  if (( ${#still_failing[@]} == 0 )); then
    log_line "$ICON_OK" "Alle zuvor fehlgeschlagenen Ziele nun erreichbar"
    unset 'FAILMAP[$iface]'
  else
    FAILMAP[$iface]="${still_failing[*]}"
  fi
}
|
||
|
||
#######################################
# Restart an interface (ifdown, 2 s pause, ifup) and, after a further 5 s,
# ping every configured target except this node's own IP. Every step is logged.
# Globals:   PING_TARGETS, SELF_IP, ICON_* (read); FAILMAP (written through
#            add_failed_targets with the targets that did not answer)
# Arguments: $1 - interface name
#######################################
restart_with_checks() {
  local iface="$1"
  # Declared before the first loop so the loop variable never leaks into the
  # caller's scope.
  local ip
  local -a failed_now=()

  log_line "$ICON_RESTART" "Starte Interface $(ci "$iface") neu..."
  /sbin/ifdown "$iface" &>/dev/null && log_line "$ICON_INFO" "Interface $(ci "$iface") heruntergefahren"
  sleep 2

  if ! /sbin/ifup "$iface" &>/dev/null; then
    log_line "$ICON_ERR" "Interface $(ci "$iface") konnte nicht gestartet werden"
    return 0
  fi

  log_line "$ICON_OK" "Interface $(ci "$iface") erfolgreich gestartet"
  sleep 5

  # Ping the peers, leaving out this node's own dummy IP.
  detect_self_ip
  for ip in "${PING_TARGETS[@]}"; do
    [[ -z "$ip" ]] && continue
    [[ -n "$SELF_IP" && "$ip" == "$SELF_IP" ]] && continue

    if ping_host "$iface" "$ip"; then
      log_line "$ICON_OK" "Ping erfolgreich: $ip"
    else
      failed_now+=("$ip")
      log_line "$ICON_ERR" "Ping fehlgeschlagen: $ip"
    fi
  done

  if (( ${#failed_now[@]} > 0 )); then
    # Remember the failures so the next cycle retries them.
    add_failed_targets "$iface" "${failed_now[@]}"
  else
    log_line "$ICON_OK" "Interface $(ci "$iface") erfolgreich neugestartet"
  fi
}
|
||
|
||
#######################################
# Main routine: every CHECK_INTERVAL seconds rotate the log if needed and
# handle each interface on its own:
#   up      -> retry ping targets that failed earlier
#   down    -> restart the interface and check reachability
#   missing -> log a warning
# Globals:   INTERFACES, CHECK_INTERVAL, SELF_IP, ICON_* (read)
# Returns:   never returns; the loop runs until the process is terminated
#######################################
monitor_loop() {
  local iface status

  log_line "$ICON_INFO" "Thunderbolt Monitor gestartet (Intervall: ${CHECK_INTERVAL}s)"
  detect_self_ip
  if [[ -n "$SELF_IP" ]]; then
    log_line "$ICON_INFO" "Eigene Dummy-IP erkannt: $SELF_IP (von Pings ausgeschlossen)"
  fi

  while true; do
    rotate_log

    for iface in "${INTERFACES[@]}"; do
      # Capture the status explicitly so nothing can clobber $? before the case.
      status=0
      is_interface_up "$iface" || status=$?

      case "$status" in
        0)
          log_line "$ICON_INFO" "Interface $(ci "$iface") ist UP – keine Aktion erforderlich."
          # Retry ping failures left over from an earlier restart.
          retry_failed_targets "$iface"
          ;;
        1)
          log_line "$ICON_DOWN" "Interface $(ci "$iface") ist DOWN – versuche Neustart."
          restart_with_checks "$iface"
          ;;
        2)
          log_line "$ICON_WARN" "Interface $(ci "$iface") existiert nicht"
          ;;
      esac
    done

    sleep "$CHECK_INTERVAL"
  done
}
|
||
|
||
# Signal handler: log the shutdown and exit with status 0.
cleanup() {
  log_line "$ICON_WARN" "Thunderbolt Monitor wird beendet"
  exit 0
}
trap cleanup SIGTERM SIGINT

# Entry point:
#   --test   run the monitor loop in the background for 3 seconds, then stop it
#   --clear  truncate the log file
#   (other)  run the monitor loop in the foreground
case "${1:-}" in
  "--test")
    monitor_loop &
    pid=$!
    sleep 3
    kill "$pid"
    ;;
  "--clear")
    : > "$LOG_FILE"
    log_line "$ICON_INFO" "Log-Datei geleert"
    ;;
  *)
    monitor_loop
    ;;
esac
|
||
SCRIPT_EOF
|
||
|
||
chmod +x /usr/local/bin/thunderbolt-monitor.sh
|
||
```
|
||
|
||
### 2. systemd Service erstellen
|
||
|
||
```bash
|
||
cat > /etc/systemd/system/thunderbolt-monitor.service << 'EOF'
|
||
[Unit]
Description=🔌 Thunderbolt Interface Monitor
After=network.target frr.service
# Requires= already covers the weaker Wants= dependency, so it is listed alone.
Requires=network.target

[Service]
Type=simple
User=root
ExecStart=/usr/local/bin/thunderbolt-monitor.sh
# No ExecReload=: the script traps only SIGTERM and SIGINT, so a SIGHUP-based
# reload would terminate the monitor instead of reloading it.
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal

# Environment variables
Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# Security settings
NoNewPrivileges=true
ProtectSystem=strict
ReadWritePaths=/var/log /var/run /sys/class/net

[Install]
WantedBy=multi-user.target
|
||
EOF
|
||
```
|
||
|
||
### 3. Service aktivieren und starten
|
||
|
||
```bash
|
||
# Log-Datei vorbereiten
|
||
touch /var/log/thunderbolt-monitor.log
|
||
chmod 644 /var/log/thunderbolt-monitor.log
|
||
|
||
# Service aktivieren
|
||
systemctl daemon-reload
|
||
systemctl enable thunderbolt-monitor
|
||
systemctl start thunderbolt-monitor
|
||
```
|
||
|
||
## Verwendung
|
||
|
||
### Status-Icons im Log
|
||
|
||
| Icon | Bedeutung | Level |
|
||
|------|-----------|--------|
|
||
| 🔵 | Allgemeine Information | INFO |
|
||
| ✅ | Erfolgreiche Aktion | SUCCESS |
|
||
| ❌ | Fehler aufgetreten | ERROR |
|
||
| ⚠️ | Warnung | WARNING |
|
||
| 🔄 | Aktion wird ausgeführt | ACTION |
|
||
| 🟢 | Interface ist UP | UP |
|
||
| 🔴 | Interface ist DOWN | DOWN |
|
||
|
||
### Befehle
|
||
|
||
```bash
|
||
# Service starten
|
||
systemctl start thunderbolt-monitor
|
||
|
||
# Service stoppen
|
||
systemctl stop thunderbolt-monitor
|
||
|
||
# Status prüfen
|
||
systemctl status thunderbolt-monitor
|
||
|
||
# Live-Logs verfolgen
|
||
tail -f /var/log/thunderbolt-monitor.log
|
||
|
||
# journalctl Logs
|
||
journalctl -u thunderbolt-monitor -f
|
||
|
||
# Einmaliger Test
|
||
/usr/local/bin/thunderbolt-monitor.sh --test
|
||
|
||
# Manuell im Vordergrund starten
|
||
/usr/local/bin/thunderbolt-monitor.sh
|
||
```
|
||
|
||
### Konfiguration anpassen
|
||
|
||
```bash
|
||
# Script bearbeiten
|
||
nano /usr/local/bin/thunderbolt-monitor.sh
|
||
|
||
# Check-Interval ändern (in Sekunden)
|
||
CHECK_INTERVAL=30
|
||
|
||
# Überwachte Interfaces ändern
|
||
INTERFACES=("thunderbolt0" "thunderbolt1" "thunderbolt2")
|
||
```
|
||
|
||
## Log-Beispiele
|
||
|
||
```
|
||
[2025-08-14 10:59:10] 🔵 Thunderbolt Monitor gestartet (Intervall: 30s)
|
||
[2025-08-14 10:59:10] 🔵 Eigene Dummy-IP erkannt: 10.0.21.2 (von Pings ausgeschlossen)
|
||
[2025-08-14 10:59:10] 🔵 Interface thunderbolt0 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 10:59:10] 🔵 Interface thunderbolt1 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 10:59:40] 🔵 Interface thunderbolt0 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 10:59:40] 🔵 Interface thunderbolt1 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 11:00:10] 🔵 Interface thunderbolt0 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 11:00:10] 🔵 Interface thunderbolt1 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 11:00:40] 🔵 Interface thunderbolt0 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 11:00:40] ⚠️ Interface thunderbolt1 existiert nicht
|
||
[2025-08-14 11:01:10] 🔵 Interface thunderbolt0 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 11:01:10] 🔴 Interface thunderbolt1 ist DOWN - versuche Neustart.
|
||
[2025-08-14 11:01:10] 🔄 Starte Interface thunderbolt1 neu...
|
||
[2025-08-14 11:01:10] 🔵 Interface thunderbolt1 heruntergefahren
|
||
[2025-08-14 11:01:13] ✅ Interface thunderbolt1 erfolgreich gestartet
|
||
[2025-08-14 11:01:19] ✅ Ping erfolgreich: 10.0.21.1
|
||
[2025-08-14 11:01:19] ✅ Ping erfolgreich: 10.0.21.3
|
||
[2025-08-14 11:01:19] ✅ Interface thunderbolt1 erfolgreich neugestartet
|
||
[2025-08-14 11:01:49] 🔵 Interface thunderbolt0 ist UP - keine Aktion erforderlich.
|
||
[2025-08-14 11:01:49] 🔵 Interface thunderbolt1 ist UP - keine Aktion erforderlich.
|
||
```
|
||
|
||
## Troubleshooting
|
||
|
||
### Häufige Probleme
|
||
|
||
1. **Service startet nicht:**
|
||
```bash
|
||
journalctl -u thunderbolt-monitor -o cat -f
|
||
```
|
||
|
||
2. **Interfaces werden nicht erkannt:**
|
||
```bash
|
||
ip link show | grep thunderbolt
|
||
```
|
||
|
||
3. **Permissions-Probleme:**
|
||
```bash
|
||
ls -la /usr/local/bin/thunderbolt-monitor.sh
|
||
```
|
||
|
||
### Debug-Modus
|
||
|
||
```bash
|
||
# Service stoppen und manuell mit Debug starten
|
||
systemctl stop thunderbolt-monitor
|
||
/usr/local/bin/thunderbolt-monitor.sh --test
|
||
``` |