Add Proxmox/ThunderboltNetworking_PVE9.md
This commit is contained in:
412
Proxmox/ThunderboltNetworking_PVE9.md
Normal file
412
Proxmox/ThunderboltNetworking_PVE9.md
Normal file
@@ -0,0 +1,412 @@
# Thunderbolt Interface Monitor für Proxmox

Automatisches Monitoring und Recovery für Thunderbolt-Interfaces zwischen Proxmox-Nodes.

## Features

- 🔍 Überwacht `thunderbolt0` und `thunderbolt1`
- 🔄 Automatischer Neustart bei Interface-Problemen
- 📝 Detailliertes Logging mit farbigen Status-Icons
- 🚀 systemd Service Integration
- 🔧 Konfigurierbare Check-Intervalle
- 📊 Log-Rotation bei großen Dateien

## Manuelle Installation

### 1. Thunderbolt Monitor Script erstellen

```bash
cat > /usr/local/bin/thunderbolt-monitor.sh << 'SCRIPT_EOF'
#!/bin/bash
#
# thunderbolt-monitor.sh - periodically checks the configured Thunderbolt
# network interfaces and restarts any interface that is found down.
# Usage: thunderbolt-monitor.sh [--test | --clear]

# =========================
# Configuration
# =========================
INTERFACES=("thunderbolt0" "thunderbolt1")   # interfaces to watch
CHECK_INTERVAL=30                            # seconds slept between check cycles
LOG_FILE="/var/log/thunderbolt-monitor.log"
MAX_LOG_SIZE=10485760                        # rotate once the log exceeds this many bytes

# Ping targets (this node's own dummy IP, read from SELF_CONF_FILE, is skipped automatically)
PING_TARGETS=("10.0.21.1" "10.0.21.2" "10.0.21.3")
PING_COUNT=1      # passed to ping -c
PING_TIMEOUT=1    # passed to ping -W
SELF_CONF_FILE="/etc/network/interfaces.d/sdn"
SELF_IP=""        # filled in by detect_self_ip

# Icons
ICON_INFO="🔵"     # info / interface up, nothing to do
ICON_OK="✅"       # success
ICON_ERR="❌"      # error
ICON_WARN="⚠️"      # warning (e.g. interface does not exist)
ICON_RESTART="🔄"  # restart action
ICON_UP="🟢"       # up
ICON_DOWN="🔴"     # down

# Colour for interface names (the ANSI codes are written to the log as-is,
# so they render when the file is viewed with `tail -f`)
IF_COLOR_START=$'\033[95m'
IF_COLOR_END=$'\033[0m'

# =========================
# Internal state
# =========================
# Per interface: the ping targets that failed last time (retried in the next cycle)
declare -A FAILMAP=()   # key: iface, value: "ip1 ip2 ..."

# =========================
# Logging / utilities
# =========================
# Wrap an interface name in the colour codes IF_COLOR_START / IF_COLOR_END.
# Arguments: $1 - interface name
# Outputs:   the wrapped name on stdout, without a trailing newline
ci() { printf '%b' "${IF_COLOR_START}$1${IF_COLOR_END}"; }

# Emit one timestamped log entry on stdout and append the same line to LOG_FILE.
# Arguments: $1  - status icon
#            $2… - message words (joined with single spaces)
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <icon> <message>"
log_line() {
  local icon="$1"
  shift
  local msg="$*"
  local ts
  ts=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s %s\n' "$ts" "$icon" "$msg" | tee -a "$LOG_FILE"
}

# Move LOG_FILE to LOG_FILE.old once it is larger than MAX_LOG_SIZE bytes.
# Globals: LOG_FILE, MAX_LOG_SIZE, ICON_INFO (read)
# Returns: always 0 (a missing or small log file is not an error)
rotate_log() {
  [[ -f "$LOG_FILE" ]] || return 0

  local size
  # GNU stat first, BSD stat as fallback; treat an unreadable size as 0.
  size=$(stat -c%s "$LOG_FILE" 2>/dev/null || stat -f%z "$LOG_FILE" 2>/dev/null || echo 0)

  if (( size > MAX_LOG_SIZE )); then
    # Only report a rotation that actually happened.
    if mv -f -- "$LOG_FILE" "${LOG_FILE}.old"; then
      log_line "$ICON_INFO" "Log rotiert – alte Datei: ${LOG_FILE}.old"
    fi
  fi
  return 0
}

# Determine this node's own dummy IP from SELF_CONF_FILE.
# Looks for the stanza "iface dummy_tbfabric inet static" and takes the first
# "address <IP>[/<CIDR>]" line inside that stanza only.
# Globals: SELF_CONF_FILE (read), SELF_IP (written; empty when nothing is found)
# Returns: always 0
detect_self_ip() {
  SELF_IP=""
  [[ -r "$SELF_CONF_FILE" ]] || return 0

  local raw_ip
  raw_ip=$(awk '
    # Any stanza keyword ends the previous stanza; remember whether the new
    # one is the dummy_tbfabric interface.
    /^[ \t]*(iface|auto|allow-[^ \t]*|mapping|source|source-directory)([ \t]|$)/ {
      in_stanza = ($0 ~ /^[ \t]*iface[ \t]+dummy_tbfabric[ \t]+inet[ \t]+static([ \t]|$)/)
      next
    }
    # Exact keyword match, so "hwaddress" lines are not mistaken for the IP.
    in_stanza && $1 == "address" { print $2; exit }
  ' "$SELF_CONF_FILE")

  SELF_IP="${raw_ip%%/*}"   # drop the prefix length
}

# Print the first IPv4 address configured on an interface (without prefix length).
# Arguments: $1 - interface name
# Outputs:   the address on stdout, or nothing if `ip` reports none
iface_src_ip() {
  local iface="$1"
  # In `ip -o` output the 4th field is "<addr>/<prefix>"; only the first line is used.
  ip -4 -o addr show dev "$iface" 2>/dev/null \
    | awk '{ sub(/\/.*/, "", $4); print $4; exit }'
}

# Ping a target, trying three strategies in order until one succeeds:
#   1) plain routing (no -I)
#   2) -I <interface>                   (bind to the device)
#   3) -I <source IP of the interface>  (pin the source address)
# Arguments: $1 - interface name, $2 - target IP
# Globals:   PING_COUNT, PING_TIMEOUT (read)
# Returns:   0 as soon as one attempt succeeds, 1 if all fail
ping_host() {
  local iface="$1" ip="$2"
  local -a common=(-c "$PING_COUNT" -W "$PING_TIMEOUT" "$ip")

  # 1) via normal routing
  ping "${common[@]}" &>/dev/null && return 0

  # 2) bound to the interface
  ping -I "$iface" "${common[@]}" &>/dev/null && return 0

  # 3) bound to the interface's source IP, if it has one
  local src
  src="$(iface_src_ip "$iface")"
  if [[ -n "$src" ]] && ping -I "$src" "${common[@]}" &>/dev/null; then
    return 0
  fi
  return 1
}

# Report the state of an interface.
# Arguments: $1 - interface name
# Returns:   0 - `ip link` reports state UP and sysfs operstate is "up"
#            1 - interface exists but is not up
#            2 - interface does not exist
is_interface_up() {
  local iface="$1"
  local link_info state="" oper

  link_info=$(ip link show "$iface" 2>/dev/null) || return 2   # 2 = does not exist

  if [[ "$link_info" =~ state\ ([A-Z]*) ]]; then
    state="${BASH_REMATCH[1]}"
  fi
  oper=$(cat "/sys/class/net/$iface/operstate" 2>/dev/null || echo "unknown")

  if [[ "$state" == "UP" && "$oper" == "up" ]]; then
    return 0
  fi
  return 1
}

# FAILMAP helper: remember ping targets that failed for an interface.
# Merges the given IPs into FAILMAP[<iface>], dropping empty entries and
# duplicates while keeping first-seen order.
# Arguments: $1  - interface name
#            $2… - IPs to add
# Globals:   FAILMAP (read/written; value is a space-separated list)
add_failed_targets() {
  local iface="$1"
  shift
  local IFS=$' \t\n'      # fixed separator for splitting and for the join below
  local -A seen=()
  local -a merged=()
  local ip                # keep the loop variable out of the global scope

  # ${FAILMAP[...]} is intentionally unquoted: it holds a space-separated list.
  for ip in ${FAILMAP[$iface]:-} "$@"; do
    [[ -z "$ip" || -n "${seen[$ip]:-}" ]] && continue
    seen["$ip"]=1
    merged+=("$ip")
  done

  FAILMAP[$iface]="${merged[*]}"
}

# Re-ping the targets queued in FAILMAP for an interface.
# Targets that answer are dropped from the queue; the rest stay queued.
# Arguments: $1 - interface name
# Globals:   FAILMAP (read/written), ICON_INFO, ICON_OK, ICON_ERR (read)
# Returns:   0 immediately when nothing is queued for the interface
retry_failed_targets() {
  local iface="$1"
  local queued="${FAILMAP[$iface]}"
  [[ -z "$queued" ]] && return 0

  log_line "$ICON_INFO" "Retry fehlgeschlagener Pings über $(ci "$iface"): $queued"

  local -a still_failing=()
  local ip
  # $queued is intentionally unquoted: it is a space-separated list.
  for ip in $queued; do
    if ping_host "$iface" "$ip"; then
      log_line "$ICON_OK" "Retry erfolgreich: $ip über $(ci "$iface")"
    else
      log_line "$ICON_ERR" "Retry fehlgeschlagen: $ip über $(ci "$iface")"
      still_failing+=("$ip")
    fi
  done

  if (( ${#still_failing[@]} == 0 )); then
    log_line "$ICON_OK" "Alle zuvor fehlgeschlagenen Ziele über $(ci "$iface") nun erreichbar"
    unset 'FAILMAP[$iface]'
  else
    FAILMAP[$iface]="${still_failing[*]}"
  fi
}

# Restart an interface (ifdown, 2 s pause, ifup) and, if ifup succeeds,
# wait 5 s and ping every configured target through it with detailed logging.
# Targets that fail are queued in FAILMAP for a retry in a later cycle.
# Arguments: $1 - interface name
# Globals:   PING_TARGETS, SELF_IP, ICON_* (read); FAILMAP (written via add_failed_targets)
# Returns:   always 0; the outcome is reported through the log
restart_with_checks() {
  local iface="$1"
  local ip                      # declared up front so no loop leaks a global "ip"
  local -a failed_now=()

  log_line "$ICON_RESTART" "Starte Interface $(ci "$iface") neu..."

  # A failing ifdown is ignored; only a successful one is logged.
  /sbin/ifdown "$iface" &>/dev/null \
    && log_line "$ICON_INFO" "Interface $(ci "$iface") heruntergefahren"
  sleep 2

  if ! /sbin/ifup "$iface" &>/dev/null; then
    log_line "$ICON_ERR" "Interface $(ci "$iface") konnte nicht gestartet werden"
    return 0
  fi

  log_line "$ICON_OK" "Interface $(ci "$iface") erfolgreich gestartet"
  sleep 5

  # Ping the peers, skipping empty entries and this node's own dummy IP.
  detect_self_ip
  for ip in "${PING_TARGETS[@]}"; do
    [[ -z "$ip" ]] && continue
    [[ -n "$SELF_IP" && "$ip" == "$SELF_IP" ]] && continue

    if ping_host "$iface" "$ip"; then
      log_line "$ICON_OK" "Ping erfolgreich: $ip über $(ci "$iface")"
    else
      failed_now+=("$ip")
      log_line "$ICON_ERR" "Ping fehlgeschlagen: $ip über $(ci "$iface")"
    fi
  done

  if (( ${#failed_now[@]} > 0 )); then
    # Remember the failures for the next cycle.
    add_failed_targets "$iface" "${failed_now[@]}"
  else
    log_line "$ICON_OK" "Interface $(ci "$iface") erfolgreich neugestartet"
  fi
  return 0
}

# Main routine: every CHECK_INTERVAL seconds, rotate the log if needed and
# handle each configured interface on its own:
#   up      -> retry any ping targets still queued for it
#   down    -> restart it and run the reachability checks
#   missing -> log a warning
# Never returns; the loop only ends when the process exits.
monitor_loop() {
  local iface status

  log_line "$ICON_INFO" "Thunderbolt Monitor gestartet (Intervall: ${CHECK_INTERVAL}s)"
  detect_self_ip
  if [[ -n "$SELF_IP" ]]; then
    log_line "$ICON_INFO" "Eigene Dummy-IP erkannt: $SELF_IP (von Pings ausgeschlossen)"
  fi

  while true; do
    rotate_log

    for iface in "${INTERFACES[@]}"; do
      status=0
      is_interface_up "$iface" || status=$?
      case "$status" in
        0)
          log_line "$ICON_INFO" "Interface $(ci "$iface") ist UP – keine Aktion erforderlich."
          # Retry ping failures left over from an earlier restart.
          retry_failed_targets "$iface"
          ;;
        1)
          log_line "$ICON_DOWN" "Interface $(ci "$iface") ist DOWN – versuche Neustart."
          restart_with_checks "$iface"
          ;;
        2)
          log_line "$ICON_WARN" "Interface $(ci "$iface") existiert nicht"
          ;;
      esac
    done

    sleep "$CHECK_INTERVAL"
  done
}

# Signal handler: log the shutdown and exit cleanly.
cleanup() {
  log_line "$ICON_WARN" "Thunderbolt Monitor wird beendet"
  exit 0
}
trap cleanup SIGTERM SIGINT

# Entry point:
#   --test   run the monitor in the background for 3 seconds, then stop it
#   --clear  truncate the log file
#   (none)   run the monitor in the foreground until terminated
case "${1:-}" in
  --test)
    monitor_loop &
    pid=$!
    sleep 3
    kill "$pid" 2>/dev/null
    # Reap the background job so its output is complete before we exit.
    wait "$pid" 2>/dev/null || true
    ;;
  --clear)
    : > "$LOG_FILE"
    log_line "$ICON_INFO" "Log-Datei geleert"
    ;;
  *)
    monitor_loop
    ;;
esac
SCRIPT_EOF

chmod +x /usr/local/bin/thunderbolt-monitor.sh
```

### 2. systemd Service erstellen

```bash
cat > /etc/systemd/system/thunderbolt-monitor.service << 'EOF'
[Unit]
Description=🔌 Thunderbolt Interface Monitor
After=network.target frr.service
Wants=network.target
Requires=network.target

[Service]
Type=simple
User=root
ExecStart=/usr/local/bin/thunderbolt-monitor.sh
ExecReload=/bin/kill -HUP $MAINPID
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal

# Umgebungsvariablen
Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# Security Settings
NoNewPrivileges=true
ProtectSystem=strict
ReadWritePaths=/var/log /var/run /sys/class/net

[Install]
WantedBy=multi-user.target
EOF
```

### 3. Service aktivieren und starten

```bash
# Log-Datei vorbereiten
touch /var/log/thunderbolt-monitor.log
chmod 644 /var/log/thunderbolt-monitor.log

# Service aktivieren
systemctl daemon-reload
systemctl enable thunderbolt-monitor
systemctl start thunderbolt-monitor
```

## Verwendung

### Status-Icons im Log

| Icon | Bedeutung | Level |
|------|-----------|--------|
| 🔵 | Allgemeine Information | INFO |
| ✅ | Erfolgreiche Aktion | SUCCESS |
| ❌ | Fehler aufgetreten | ERROR |
| ⚠️ | Warnung | WARNING |
| 🔄 | Aktion wird ausgeführt | ACTION |
| 🟢 | Interface ist UP | UP |
| 🔴 | Interface ist DOWN | DOWN |

### Befehle

```bash
# Service starten
systemctl start thunderbolt-monitor

# Service stoppen
systemctl stop thunderbolt-monitor

# Status prüfen
systemctl status thunderbolt-monitor

# Live-Logs verfolgen
tail -f /var/log/thunderbolt-monitor.log

# journalctl Logs
journalctl -u thunderbolt-monitor -f

# Kurztest (Monitor läuft ca. 3 Sekunden und wird dann beendet)
/usr/local/bin/thunderbolt-monitor.sh --test

# Manuell im Vordergrund starten
/usr/local/bin/thunderbolt-monitor.sh
```

### Konfiguration anpassen

```bash
# Script bearbeiten
nano /usr/local/bin/thunderbolt-monitor.sh

# Check-Intervall ändern (in Sekunden)
CHECK_INTERVAL=30

# Überwachte Interfaces ändern
INTERFACES=("thunderbolt0" "thunderbolt1" "thunderbolt2")

# Änderungen werden erst nach einem Neustart des Service wirksam
systemctl restart thunderbolt-monitor
```

## Log-Beispiele

```
[2025-08-14 10:59:10] 🔵 Thunderbolt Monitor gestartet (Intervall: 30s)
[2025-08-14 10:59:10] 🔵 Eigene Dummy-IP erkannt: 10.0.21.2 (von Pings ausgeschlossen)
[2025-08-14 10:59:10] 🔵 Interface thunderbolt0 ist UP – keine Aktion erforderlich.
[2025-08-14 10:59:10] 🔵 Interface thunderbolt1 ist UP – keine Aktion erforderlich.
[2025-08-14 10:59:40] 🔵 Interface thunderbolt0 ist UP – keine Aktion erforderlich.
[2025-08-14 10:59:40] 🔵 Interface thunderbolt1 ist UP – keine Aktion erforderlich.
[2025-08-14 11:00:10] 🔵 Interface thunderbolt0 ist UP – keine Aktion erforderlich.
[2025-08-14 11:00:10] 🔵 Interface thunderbolt1 ist UP – keine Aktion erforderlich.
[2025-08-14 11:00:40] 🔵 Interface thunderbolt0 ist UP – keine Aktion erforderlich.
[2025-08-14 11:00:40] ⚠️ Interface thunderbolt1 existiert nicht
[2025-08-14 11:01:10] 🔵 Interface thunderbolt0 ist UP – keine Aktion erforderlich.
[2025-08-14 11:01:10] 🔴 Interface thunderbolt1 ist DOWN – versuche Neustart.
[2025-08-14 11:01:10] 🔄 Starte Interface thunderbolt1 neu...
[2025-08-14 11:01:10] 🔵 Interface thunderbolt1 heruntergefahren
[2025-08-14 11:01:13] ✅ Interface thunderbolt1 erfolgreich gestartet
[2025-08-14 11:01:19] ✅ Ping erfolgreich: 10.0.21.1 über thunderbolt1
[2025-08-14 11:01:19] ✅ Ping erfolgreich: 10.0.21.3 über thunderbolt1
[2025-08-14 11:01:19] ✅ Interface thunderbolt1 erfolgreich neugestartet
[2025-08-14 11:01:49] 🔵 Interface thunderbolt0 ist UP – keine Aktion erforderlich.
[2025-08-14 11:01:49] 🔵 Interface thunderbolt1 ist UP – keine Aktion erforderlich.
```

## Troubleshooting

### Häufige Probleme

1. **Service startet nicht:**
   ```bash
   journalctl -u thunderbolt-monitor -o cat -f
   ```

2. **Interfaces werden nicht erkannt:**
   ```bash
   ip link show | grep thunderbolt
   ```

3. **Permissions-Probleme:**
   ```bash
   ls -la /usr/local/bin/thunderbolt-monitor.sh
   ```

### Debug-Modus

```bash
# Service stoppen und das Script manuell im Kurztest starten (läuft ca. 3 Sekunden)
systemctl stop thunderbolt-monitor
/usr/local/bin/thunderbolt-monitor.sh --test
```
Reference in New Issue
Block a user