#!/usr/bin/env bash ############################################################################### # XGenStack Agent Manager — Unified Install / Update / Remove # # One command for everything: # # Fresh install: # curl -sSL https://xgenstack.com/agent | bash -s -- install \ # --api-url https://xgenstack.com \ # --enroll-token TOKEN --server-id ID # # Update (auto-detects existing install): # curl -sSL https://xgenstack.com/agent | bash -s -- update # # Remove: # curl -sSL https://xgenstack.com/agent | bash -s -- remove # curl -sSL https://xgenstack.com/agent | bash -s -- remove --purge # # Status: # curl -sSL https://xgenstack.com/agent | bash -s -- status # # Or just: # curl -sSL https://xgenstack.com/agent | bash # (auto-detects: updates if installed, shows help if not) # ############################################################################### set -euo pipefail SCRIPT_VERSION="1.0.2" _BIN="/usr/local/bin/xgs-agent" _UPDATE_BIN="/usr/local/bin/xgs-update" _CONF_DIR="/etc/xgenstack" _DATA="/var/lib/xgs" _LOGS="/var/log/xgenstack" _APPS="/opt/xgs/apps" _SVC="xgs-agent" _SVC_FILE="/etc/systemd/system/xgs-agent.service" _ENV="/etc/xgenstack/agent.env" _VER_FILE="/etc/xgenstack/version" _LOCK="/var/run/xgs-update.lock" _BAK="/tmp/xgs-backup-$$" _PLAT="https://xgenstack.com" _UPD_SVC="/etc/systemd/system/xgs-update.service" _UPD_TMR="/etc/systemd/system/xgs-update.timer" _WD_BIN="/usr/local/bin/xgs-watchdog" _WD_SVC="xgs-watchdog" _WD_SVC_FILE="/etc/systemd/system/xgs-watchdog.service" # --------------------------------------------------------------------------- # Colors & Output # --------------------------------------------------------------------------- RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m' BLUE='\033[0;34m'; CYAN='\033[0;36m'; BOLD='\033[1m' DIM='\033[2m'; NC='\033[0m' info() { echo -e "${BLUE} [INFO]${NC} $*"; } ok() { echo -e "${GREEN} [ OK ]${NC} $*"; } warn() { echo -e "${YELLOW} [WARN]${NC} $*"; } err() { echo -e "${RED} [FAIL]${NC} 
$*"; } step() { echo -e "\n${BOLD}${CYAN} ▸ $*${NC}"; } banner() { echo "" echo -e "${BOLD}${GREEN} ╔══════════════════════════════════════════╗${NC}" echo -e "${BOLD}${GREEN} ║ XGenStack Agent Manager v${SCRIPT_VERSION} ║${NC}" echo -e "${BOLD}${GREEN} ╚══════════════════════════════════════════╝${NC}" echo "" } die() { err "$*"; exit 1; } # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- require_root() { [ "$(id -u)" -eq 0 ] || die "Must be run as root (use sudo)" } is_installed() { [ -f "${_BIN}" ] && [ -f "${_ENV}" ] } is_running() { systemctl is-active --quiet "${_SVC}" 2>/dev/null } get_current_version() { if [ -f "${_VER_FILE}" ]; then tr -d '[:space:]' < "${_VER_FILE}" 2>/dev/null elif [ -x "${_BIN}" ]; then timeout 3 "${_BIN}" --version 2>/dev/null | grep -oP '[0-9]+\.[0-9]+\.[0-9]+' || echo "unknown" else echo "not installed" fi } get_arch_label() { local arch arch=$(uname -m) case "${arch}" in x86_64) echo "amd64" ;; aarch64) echo "arm64" ;; armv7l) echo "armv7" ;; *) echo "${arch}" ;; esac } get_download_url() { local api_url="${1}" local sys arch sys=$(uname -s | tr '[:upper:]' '[:lower:]') arch=$(get_arch_label) echo "${api_url}/downloads/agent/${sys}-${arch}" } # Load specific vars from agent.env without polluting script namespace load_env() { if [ ! 
-f "${_ENV}" ]; then return 1 fi # Only extract the vars we need _E_API_URL=$(grep -E '^API_URL=' "${_ENV}" | head -1 | cut -d= -f2-) _E_AGENT_KEY=$(grep -E '^AGENT_KEY=' "${_ENV}" | head -1 | cut -d= -f2-) _E_NODE_ID=$(grep -E '^NODE_ID=' "${_ENV}" | head -1 | cut -d= -f2-) _E_SERVER_ID=$(grep -E '^SERVER_ID=' "${_ENV}" | head -1 | cut -d= -f2-) return 0 } # Validate binary is ELF without requiring 'file' command is_elf_binary() { local path="$1" # Check ELF magic bytes: 0x7f 'E' 'L' 'F' local magic magic=$(xxd -l 4 -p "${path}" 2>/dev/null || od -A n -t x1 -N 4 "${path}" 2>/dev/null | tr -d ' ') [ "${magic}" = "7f454c46" ] } acquire_lock() { if [ -f "${_LOCK}" ]; then local pid pid=$(cat "${_LOCK}" 2>/dev/null || echo "") if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then die "Another operation is running (PID ${pid}). Wait or remove ${_LOCK}" fi rm -f "${_LOCK}" fi echo $$ > "${_LOCK}" } release_lock() { rm -f "${_LOCK}" 2>/dev/null || true } cleanup() { release_lock rm -rf "${_BAK}" 2>/dev/null || true } trap cleanup EXIT # --------------------------------------------------------------------------- # COMMAND: status # --------------------------------------------------------------------------- cmd_status() { banner if ! 
is_installed; then info "Agent is ${RED}not installed${NC}" echo "" echo " Install with:" echo " curl -sSL ${_PLAT}/agent | bash -s -- install \\" echo " --api-url ${_PLAT} \\" echo " --enroll-token TOKEN --server-id ID" echo "" return 0 fi local ver ver=$(get_current_version) echo -e " ${CYAN}Version:${NC} ${ver}" echo -e " ${CYAN}Binary:${NC} ${_BIN}" echo -e " ${CYAN}Config:${NC} ${_ENV}" if is_running; then echo -e " ${CYAN}Status:${NC} ${GREEN}running${NC}" local pid pid=$(systemctl show -p MainPID --value "${_SVC}" 2>/dev/null || echo "?") echo -e " ${CYAN}PID:${NC} ${pid}" local uptime_val uptime_val=$(systemctl show -p ActiveEnterTimestamp --value "${_SVC}" 2>/dev/null || echo "?") echo -e " ${CYAN}Since:${NC} ${uptime_val}" else echo -e " ${CYAN}Status:${NC} ${RED}stopped${NC}" fi if load_env 2>/dev/null; then echo -e " ${CYAN}API URL:${NC} ${_E_API_URL:-unknown}" echo -e " ${CYAN}Node ID:${NC} ${_E_NODE_ID:-unknown}" echo -e " ${CYAN}Server ID:${NC} ${_E_SERVER_ID:-unknown}" fi # Check watchdog if systemctl is-active --quiet xgs-watchdog 2>/dev/null; then echo -e " ${CYAN}Watchdog:${NC} ${GREEN}active${NC}" else echo -e " ${CYAN}Watchdog:${NC} ${YELLOW}inactive${NC}" fi # Check update timer if systemctl is-active --quiet xgs-update.timer 2>/dev/null; then local next next=$(systemctl show -p NextElapseUSecRealtime --value xgs-update.timer 2>/dev/null || echo "?") echo -e " ${CYAN}Auto-update:${NC} active (next: ${next})" else echo -e " ${CYAN}Auto-update:${NC} ${YELLOW}inactive${NC}" fi echo "" echo -e " ${DIM}Logs: journalctl -u ${_SVC} -f${NC}" echo "" } # --------------------------------------------------------------------------- # COMMAND: update # --------------------------------------------------------------------------- cmd_update() { require_root banner if ! is_installed; then die "Agent is not installed. Use 'install' command first." 
fi

  # From here on: lock, resolve the target version, download + verify the new
  # binary, swap it in with the service stopped, then health-check and roll
  # back to the backup if the service does not come up.
  acquire_lock
  load_env || die "Cannot load ${_ENV}"

  local api_url="${_E_API_URL:-}"
  [ -n "${api_url}" ] || die "API_URL not set in ${_ENV}"

  local current_ver
  current_ver=$(get_current_version)
  step "Current version: ${current_ver}"

  # --- Check for new version ---
  step "Checking for updates..."
  local agent_key="${_E_AGENT_KEY:-}"
  local node_id="${_E_NODE_ID:-}"
  api_url="${api_url%/}"
  local latest_ver=""
  local download_url=""
  local checksum=""

  # Try the version endpoint (best effort: any failure leaves ver_resp empty)
  local ver_resp=""
  ver_resp=$(curl -sS --connect-timeout 10 --max-time 30 \
    -H "X-Agent-Key: ${agent_key}" \
    -H "X-Node-ID: ${node_id}" \
    "${api_url}/api/v1/agents/version" 2>/dev/null) || true

  if [ -n "${ver_resp}" ]; then
    latest_ver=$(echo "${ver_resp}" | jq -r '.data.version // .version // empty' 2>/dev/null || true)
    download_url=$(echo "${ver_resp}" | jq -r '.data.download_url // .download_url // empty' 2>/dev/null || true)
    checksum=$(echo "${ver_resp}" | jq -r '.data.checksum // .checksum // empty' 2>/dev/null || true)
  fi

  if [ -z "${latest_ver}" ]; then
    warn "Could not determine latest version from API. Forcing update from binary download."
    latest_ver="latest"
  fi

  local force="${FORCE:-false}"
  if [ "${force}" = false ] && [ "${latest_ver}" != "latest" ] && [ "${latest_ver}" = "${current_ver}" ]; then
    ok "Already up to date (${current_ver}). Use --force to re-download."
    return 0
  fi

  info "Updating: ${current_ver} → ${latest_ver}"

  # --- Download new binary ---
  if [ -z "${download_url}" ]; then
    download_url=$(get_download_url "${api_url}")
  fi

  step "Downloading new binary..."
  # mktemp instead of a PID-based name: this runs as root and /tmp is
  # world-writable, so a predictable path could be pre-created by another user.
  local tmp_bin
  tmp_bin=$(mktemp /tmp/xgs-agent-new.XXXXXX) || die "Cannot create temporary file in /tmp"

  # On a TTY curl's default progress meter (percent, sizes, speed, ETA) is
  # left enabled and goes to stderr, so it stays visible while stdout is
  # captured. Otherwise (piped/systemd) -s silences it. Either way we print
  # our own summary line from the -w fields afterwards.
  local -a curl_flags=(-fS)
  [ -t 2 ] || curl_flags=(-fsS)

  local http_code size_bytes speed_bps time_total stats
  stats=$(curl "${curl_flags[@]}" \
    -o "${tmp_bin}" \
    --connect-timeout 10 --max-time 120 \
    -H "X-Agent-Key: ${agent_key}" \
    -H "X-Node-ID: ${node_id}" \
    -w '%{http_code} %{size_download} %{speed_download} %{time_total}' \
    "${download_url}" || echo "000 0 0 0")
  # On failure curl may still print its own -w line first; read takes line 1.
  read -r http_code size_bytes speed_bps time_total <<< "${stats}"

  # Print formatted summary regardless of TTY (helpful in logs too).
  if [ -n "${size_bytes}" ] && [ "${size_bytes}" != "0" ]; then
    local sz_mb spd_mbs
    # Values are passed with -v rather than spliced into the awk program.
    sz_mb=$(awk -v b="${size_bytes}" 'BEGIN{printf "%.2f", b/1048576}')
    spd_mbs=$(awk -v b="${speed_bps}" 'BEGIN{printf "%.2f", b/1048576}')
    echo -e " ${DIM}${sz_mb} MB at ${spd_mbs} MB/s in ${time_total}s${NC}"
  fi

  if [ "${http_code}" != "200" ] || [ ! -s "${tmp_bin}" ]; then
    rm -f "${tmp_bin}"
    die "Download failed (HTTP ${http_code}) from ${download_url}"
  fi

  local size size_human
  size=$(wc -c < "${tmp_bin}")
  if [ "${size}" -ge 1048576 ]; then
    size_human="$((size / 1048576)) MB ($(printf "%'d" "${size}") bytes)"
  else
    size_human="$(printf "%'d" "${size}") bytes"
  fi
  ok "Downloaded ${size_human}"

  # --- Verify checksum (only when the API supplied one) ---
  if [ -n "${checksum}" ]; then
    step "Verifying checksum..."
    local actual
    actual=$(sha256sum "${tmp_bin}" | awk '{print $1}')
    if [ "${actual}" != "${checksum}" ]; then
      rm -f "${tmp_bin}"
      die "Checksum mismatch! Expected: ${checksum} Got: ${actual}"
    fi
    ok "Checksum verified"
  fi

  # --- Verify it's a valid binary ---
  chmod +x "${tmp_bin}"
  if ! is_elf_binary "${tmp_bin}"; then
    rm -f "${tmp_bin}"
    die "Downloaded file is not a valid ELF binary"
  fi
  ok "Binary validated (ELF)"

  # --- Backup current binary ---
  step "Backing up current binary..."
  mkdir -p "${_BAK}"
  if [ -f "${_BIN}" ]; then
    cp "${_BIN}" "${_BAK}/xgs-agent.bak"
    ok "Backed up to ${_BAK}/xgs-agent.bak"
  fi
  if [ -f "${_VER_FILE}" ]; then
    cp "${_VER_FILE}" "${_BAK}/version.bak"
  fi

  # --- Stop service ---
  step "Stopping agent..."
  systemctl stop "${_SVC}" 2>/dev/null || true
  sleep 1

  # Double-check it's actually stopped (kill lingering process)
  local old_pid=""
  old_pid=$(systemctl show -p MainPID --value "${_SVC}" 2>/dev/null || echo "0")
  if [ "${old_pid}" != "0" ] && [ -n "${old_pid}" ] && kill -0 "${old_pid}" 2>/dev/null; then
    warn "Service still running (PID ${old_pid}), sending SIGKILL..."
    kill -9 "${old_pid}" 2>/dev/null || true
    sleep 1
  fi
  ok "Agent stopped"

  # --- Replace binary ---
  step "Installing new binary..."
  mv "${tmp_bin}" "${_BIN}"
  # mktemp creates the file 0600; set the final mode explicitly.
  chmod 0755 "${_BIN}"
  ok "Binary replaced"

  # --- Update version file ---
  if [ "${latest_ver}" != "latest" ]; then
    echo "${latest_ver}" > "${_VER_FILE}"
  else
    echo "0.0.0" > "${_VER_FILE}"
  fi

  # --- Reinstall auto-update timer (keeps it current) ---
  step "Updating auto-update timer..."
  install_update_timer "${api_url}"

  # --- Update watchdog ---
  step "Updating watchdog..."
  install_watchdog "${api_url}"

  # --- Install Nixpacks (v2.5 Tier 1 detector) — managed only ---
  # Read the posture from the existing agent.env. Missing ⇒ managed (matches
  # the agent binary), so existing managed boxes keep nixpacks on update;
  # observe boxes never get it.
  # `|| true`: grep exits 1 when the key is absent, and under pipefail that
  # fails the assignment. At this point the agent is already stopped, so an
  # errexit abort here would leave it down instead of treating the box as
  # managed.
  local _mode=""
  _mode=$(grep -E '^XGS_AGENT_PROFILE=' "${_ENV}" 2>/dev/null | head -1 | cut -d= -f2-) || true
  if [ -z "${_mode}" ]; then
    _mode=$(grep -E '^XGS_AGENT_MODE=' "${_ENV}" 2>/dev/null | head -1 | cut -d= -f2-) || true
  fi
  if [ "${_mode}" != "observe" ]; then
    step "Ensuring nixpacks is installed..."
    install_nixpacks
  fi

  # Ollama/Phi-3 RCA model install removed — too heavy for typical VPS.
  # RCA uses the agent's built-in rule-based engine.

  # --- Start service ---
  step "Starting agent..."
  systemctl start "${_SVC}"

  # --- Health check: poll every 2s, up to ${retries} times ---
  step "Health check..."
  local retries=5
  local healthy=false
  local i
  for ((i = 1; i <= retries; i++)); do
    sleep 2
    if systemctl is-active --quiet "${_SVC}"; then
      healthy=true
      break
    fi
    warn "Attempt ${i}/${retries}: not running yet..."
  done

  if [ "${healthy}" = true ]; then
    ok "Agent is running!"
    local new_ver
    new_ver=$(get_current_version)
    echo ""
    echo -e " ${GREEN}${BOLD}Update successful: ${current_ver} → ${new_ver}${NC}"
    echo -e " ${DIM}Logs: journalctl -u ${_SVC} -f${NC}"
    echo ""
    # Clean up backup
    rm -rf "${_BAK}"
  else
    # --- ROLLBACK ---
    err "Agent failed to start after update!"
    echo ""
    if [ -f "${_BAK}/xgs-agent.bak" ]; then
      step "Rolling back to previous version..."
      mv "${_BAK}/xgs-agent.bak" "${_BIN}"
      chmod +x "${_BIN}"
      if [ -f "${_BAK}/version.bak" ]; then
        cp "${_BAK}/version.bak" "${_VER_FILE}"
      fi
      systemctl start "${_SVC}" 2>/dev/null || true
      sleep 2
      if systemctl is-active --quiet "${_SVC}"; then
        ok "Rollback successful. Running previous version."
      else
        err "Rollback failed! Agent is not running."
        err "Manual fix needed: journalctl -u ${_SVC} --no-pager -n 50"
      fi
    else
      err "No backup available for rollback."
      err "Manual fix needed: journalctl -u ${_SVC} --no-pager -n 50"
    fi
    exit 1
  fi
}

# ---------------------------------------------------------------------------
# COMMAND: install
# ---------------------------------------------------------------------------
cmd_install() {
  require_root
  banner

  # Parse install-specific arguments.
  # --mode: observe (default) = pure sensor, minimal footprint, no host
  # mutation; managed = full provisioning (build deps, nixpacks, startup
  # self-heal/firewall/vhosts). See docs/observe-mode-plan.
local api_url="" enroll_token="" server_id="" force=false mode="observe" while [ $# -gt 0 ]; do case "$1" in --api-url=*) api_url="${1#*=}"; shift ;; --api-url) api_url="${2:-}"; shift 2 ;; --enroll-token=*) enroll_token="${1#*=}"; shift ;; --enroll-token) enroll_token="${2:-}"; shift 2 ;; --token=*) enroll_token="${1#*=}"; shift ;; --token) enroll_token="${2:-}"; shift 2 ;; --server-id=*) server_id="${1#*=}"; shift ;; --server-id) server_id="${2:-}"; shift 2 ;; --profile=*) mode="${1#*=}"; shift ;; --profile) mode="${2:-}"; shift 2 ;; --mode=*) mode="${1#*=}"; shift ;; --mode) mode="${2:-}"; shift 2 ;; --managed) mode="managed"; shift ;; --observe) mode="observe"; shift ;; --force) force=true; shift ;; *) shift ;; esac done # Normalise mode — anything other than the literal "managed" is observe. [ "${mode}" = "managed" ] || mode="observe" # Validate [ -n "${api_url}" ] || die "Missing --api-url" [ -n "${enroll_token}" ] || die "Missing --enroll-token" [ -n "${server_id}" ] || die "Missing --server-id" api_url="${api_url%/}" # Fetch the platform's "latest agent version" pointer so we can # write the correct value into /etc/xgenstack/version AND report # it during enrollment. Without this the script hard-coded 0.3.0 # and every fresh install reported as ancient. LATEST_AGENT_VER=$(curl -sS --connect-timeout 5 --max-time 10 \ -H "Accept: application/json" \ "${api_url}/api/v1/agents/version" 2>/dev/null \ | jq -r '.data.version // .version // empty' 2>/dev/null || echo "") if [ -z "${LATEST_AGENT_VER}" ]; then warn "Could not fetch latest version from platform — defaulting to 0.0.0 (heartbeat will trigger an upgrade)" LATEST_AGENT_VER="0.0.0" fi # Check existing install if is_installed && [ "${force}" = false ]; then if is_running; then warn "Agent is already installed and running." 
warn "To update: curl -sSL ${_PLAT}/agent | bash -s -- update" warn "To reinstall: add --force" exit 0 fi fi # Clean any previous install if is_installed || [ -f "${_BIN}" ]; then step "Cleaning previous installation..." systemctl stop "${_SVC}" 2>/dev/null || true systemctl disable "${_SVC}" 2>/dev/null || true rm -f "${_BIN}" "${_UPDATE_BIN}" rm -f "${_SVC_FILE}" "${_UPD_SVC}" "${_UPD_TMR}" rm -f "${_LOCK}" systemctl daemon-reload 2>/dev/null || true ok "Previous install cleaned" fi acquire_lock # --- Detect OS --- step "Detecting system..." local os="" os_ver="" arch="" if [ -f /etc/os-release ]; then # shellcheck source=/dev/null . /etc/os-release os="${ID}" os_ver="${VERSION_ID}" else die "Cannot detect OS" fi arch=$(uname -m) ok "${os} ${os_ver} (${arch})" # --- Install prerequisites --- # observe mode: install ONLY the tiny tools the agent itself needs # (curl/jq/ca-certificates). No compiler toolchain, no image libs — # nothing that mutates a production box beyond the agent. # managed mode: ALSO install the native dev libraries that popular Node # packages need at deploy time (sharp/canvas/better-sqlite3/node-postgres # native), removing the #2 deploy-failure class (W1.1). ~150 MB one-time. if [ "${mode}" = "managed" ]; then step "Installing prerequisites + native build deps (managed)..." 
if command -v apt-get &>/dev/null; then DEBIAN_FRONTEND=noninteractive apt-get update -qq 2>/dev/null DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ curl jq ca-certificates git \ build-essential pkg-config python3 \ libvips-dev libcairo2-dev libpango1.0-dev \ libjpeg-dev libgif-dev librsvg2-dev \ libsqlite3-dev libpq-dev \ 2>/dev/null || warn "some native deps may not have installed (non-fatal; deploys will retry)" elif command -v dnf &>/dev/null; then dnf install -y -q \ curl jq ca-certificates git \ gcc gcc-c++ make pkgconf-pkg-config python3 \ vips-devel cairo-devel pango-devel \ libjpeg-turbo-devel giflib-devel librsvg2-devel \ sqlite-devel libpq-devel \ 2>/dev/null || warn "some native deps may not have installed" elif command -v yum &>/dev/null; then yum install -y -q curl jq ca-certificates 2>/dev/null fi ok "Prerequisites + native build deps ready" else step "Installing minimal prerequisites (observe — curl, jq, ca-certificates)..." if command -v apt-get &>/dev/null; then DEBIAN_FRONTEND=noninteractive apt-get install -y -qq curl jq ca-certificates 2>/dev/null || warn "prereq install had warnings (non-fatal)" elif command -v dnf &>/dev/null; then dnf install -y -q curl jq ca-certificates 2>/dev/null || true elif command -v yum &>/dev/null; then yum install -y -q curl jq ca-certificates 2>/dev/null || true fi ok "Minimal prerequisites ready (observe mode — no build toolchain installed)" fi # --- Create directories --- step "Creating directories..." for dir in "${_CONF_DIR}" "${_DATA}" "${_LOGS}" "${_APPS}"; do mkdir -p "${dir}" done chmod 750 "${_CONF_DIR}" "${_DATA}" "${_LOGS}" ok "Directories ready" # --- Download binary --- step "Downloading agent binary..." 
local download_url download_url=$(get_download_url "${api_url}") local tmp_bin="/tmp/xgs-agent-download-$$" local http_code http_code=$(curl -sS -w '%{http_code}' -o "${tmp_bin}" \ --connect-timeout 15 --max-time 120 \ "${download_url}" 2>/dev/null || echo "000") if [ "${http_code}" != "200" ] || [ ! -s "${tmp_bin}" ]; then rm -f "${tmp_bin}" die "Download failed (HTTP ${http_code}) from ${download_url}" fi chmod +x "${tmp_bin}" if ! is_elf_binary "${tmp_bin}"; then rm -f "${tmp_bin}" die "Downloaded file is not a valid ELF binary" fi mv "${tmp_bin}" "${_BIN}" chmod +x "${_BIN}" local size size=$(wc -c < "${_BIN}") ok "Installed ${_BIN} (${size} bytes)" # --- Enroll with platform --- step "Enrolling with platform..." local hostname_val ip_val hostname_val="$(hostname)" ip_val="$(curl -s --connect-timeout 5 ifconfig.me 2>/dev/null || hostname -I 2>/dev/null | awk '{print $1}' || echo 'unknown')" local enroll_resp enroll_resp=$(curl -sS --connect-timeout 10 --max-time 30 \ -X POST "${api_url}/api/v1/agents/enroll" \ -H "Content-Type: application/json" \ -d "{ \"server_id\": \"${server_id}\", \"token\": \"${enroll_token}\", \"hostname\": \"${hostname_val}\", \"ip_address\": \"${ip_val}\", \"os\": \"${os}\", \"os_version\": \"${os_ver}\", \"arch\": \"${arch}\", \"agent_version\": \"${LATEST_AGENT_VER}\" }") || die "Failed to connect to ${api_url}" local node_id agent_key hmac_secret node_id=$(echo "${enroll_resp}" | jq -r '.data.node_id // empty') agent_key=$(echo "${enroll_resp}" | jq -r '.data.agent_key // empty') hmac_secret=$(echo "${enroll_resp}" | jq -r '.data.hmac_secret // empty') if [ -z "${node_id}" ] || [ -z "${agent_key}" ]; then err "Enrollment failed:" echo "${enroll_resp}" | jq . 
2>/dev/null || echo "${enroll_resp}" exit 1 fi ok "Enrolled (Node: ${node_id})" # Save certs if provided local cert_pem key_pem cert_pem=$(echo "${enroll_resp}" | jq -r '.data.cert_pem // empty') key_pem=$(echo "${enroll_resp}" | jq -r '.data.key_pem // empty') if [ -n "${cert_pem}" ] && [ -n "${key_pem}" ]; then echo "${cert_pem}" > "${_DATA}/node.crt" echo "${key_pem}" > "${_DATA}/node.key" chmod 600 "${_DATA}/node.key" fi # --- Write config --- step "Writing configuration..." cat > "${_ENV}" < "${_VER_FILE}" ok "Config written to ${_ENV}" # --- Install systemd service --- step "Installing systemd service..." cat > "${_SVC_FILE}" <<'SVCEOF' [Unit] Description=XGenStack Agent Documentation=https://xgenstack.com/docs/ After=network-online.target Wants=network-online.target [Service] # Type=notify enables the systemd watchdog protocol. The agent's # StartSystemdWatchdog goroutine sends sd_notify(READY=1) once the WS connect # loop is up, then pings WATCHDOG=1 every WATCHDOG_USEC/2. If a ping is # missed (deadlock, infinite loop, exhausted goroutine pool), systemd # kills + respawns us. Type=notify NotifyAccess=main WatchdogSec=30s EnvironmentFile=/etc/xgenstack/agent.env ExecStart=/usr/local/bin/xgs-agent # Standard restart policy. On crash-loop (>5 fails in 60s) systemd holds # off and the separate xgs-watchdog binary kicks in (see scripts/xgs-watchdog.sh). Restart=always RestartSec=5 StartLimitInterval=60 StartLimitBurst=5 # Resource isolation. The agent itself is small at steady state (~30MB), # but during a build it streams turbo/pnpm/next output through bufio # scanners + WS-send buffers and the in-process line counter — easily # 200-400MB transient. 512M MemoryMax killed the agent mid-build on # OpenStatus (2026-05-14), which in turn killed the systemd-run scope # hosting the build → "exit 1 with no error". 2GB is generous headroom. 
# Build children run in their OWN systemd-run scope under xgs-build.slice # (sibling, not child) so their memory does NOT count against this cap. MemoryHigh=1G MemoryMax=2G CPUQuota=200% TasksMax=infinity LimitNOFILE=65536 LimitNPROC=4096 # Sandbox hardening. The agent runs as root by design (apt, systemctl, # nginx vhosts, certbot, docker, iptables). NoNewPrivileges=true prevents # child processes gaining extra capabilities via setuid binaries. # ProtectSystem=strict makes /usr /boot /efi read-only; explicit # ReadWritePaths re-opens only what the agent actually writes to. # ProtectHome=false: the file manager lets users manage /root and /home. # ProtectControlGroups=false: docker and systemctl need cgroup access. # # Each ReadWritePaths entry is prefixed with `-` (tolerate-if-absent). On a # minimal fresh VPS the heavy dirs (/etc/nginx, /etc/letsencrypt, /etc/fail2ban, # /etc/cron.*) don't exist until those components are lazy-installed on first # use — and systemd fails the ENTIRE namespace setup (exit 226/NAMESPACE, # agent never starts) if any listed path is missing. The `-` makes systemd # silently skip absent paths and re-include them after they're created. NoNewPrivileges=true ProtectSystem=strict ReadWritePaths=-/var/lib/xgs -/usr/local/bin -/tmp -/etc/nginx -/etc/letsencrypt -/etc/systemd -/etc/xgenstack -/etc/fail2ban -/etc/ssh -/etc/cron.d -/etc/cron.daily PrivateTmp=true RestrictSUIDSGID=true ProtectHome=false ProtectKernelModules=true ProtectControlGroups=false StandardOutput=journal StandardError=journal SyslogIdentifier=xgs-agent [Install] WantedBy=multi-user.target SVCEOF systemctl daemon-reload systemctl enable "${_SVC}" 2>/dev/null ok "Service installed" # --- Install auto-update timer --- step "Installing auto-update timer..." install_update_timer "${api_url}" # --- Install watchdog --- step "Installing watchdog service..." 
install_watchdog "${api_url}" # --- Install Nixpacks (v2.5 Tier 1 detector) — managed only --- if [ "${mode}" = "managed" ]; then step "Installing Nixpacks build detector..." install_nixpacks fi # Ollama/Phi-3 RCA model install removed — too heavy for typical VPS. # RCA uses the agent's built-in rule-based engine. # --- Start agent --- step "Starting agent..." systemctl start "${_SVC}" # Health check local retries=5 healthy=false for i in $(seq 1 ${retries}); do sleep 2 if is_running; then healthy=true break fi warn "Waiting... (${i}/${retries})" done if [ "${healthy}" = true ]; then ok "Agent is running!" else err "Agent failed to start. Check: journalctl -u ${_SVC} --no-pager -n 30" exit 1 fi # --- Summary --- echo "" echo -e " ${BOLD}${GREEN}╔══════════════════════════════════════════╗${NC}" echo -e " ${BOLD}${GREEN}║ Installation Complete! ║${NC}" echo -e " ${BOLD}${GREEN}╚══════════════════════════════════════════╝${NC}" echo "" echo -e " ${CYAN}Node ID:${NC} ${node_id}" echo -e " ${CYAN}Server:${NC} ${server_id}" echo -e " ${CYAN}IP:${NC} ${ip_val}" echo -e " ${CYAN}OS:${NC} ${os} ${os_ver} (${arch})" echo -e " ${CYAN}Version:${NC} ${LATEST_AGENT_VER}" echo "" echo -e " ${DIM}Commands:${NC}" echo -e " ${DIM}systemctl status ${_SVC}${NC}" echo -e " ${DIM}journalctl -u ${_SVC} -f${NC}" echo -e " ${DIM}curl -sSL ${_PLAT}/agent | bash -s -- update${NC}" echo "" } # --------------------------------------------------------------------------- # COMMAND: remove # --------------------------------------------------------------------------- cmd_remove() { require_root banner local purge=false yes=false while [ $# -gt 0 ]; do case "$1" in --purge) purge=true; shift ;; --yes|-y) yes=true; shift ;; *) shift ;; esac done if ! is_installed && [ ! -f "${_BIN}" ] && [ ! -f "${_SVC_FILE}" ]; then info "Agent is not installed. Nothing to remove." 
return 0 fi if [ "${yes}" = false ]; then echo -e " ${BOLD}This will remove the XGenStack agent.${NC}" if [ "${purge}" = true ]; then echo -e " ${YELLOW}--purge: ALL config, data, and logs will be deleted.${NC}" fi echo "" echo -n " Continue? [y/N] " read -r confirm case "${confirm}" in [yY]|[yY][eE][sS]) ;; *) info "Cancelled."; exit 0 ;; esac fi # Stop services step "Stopping services..." systemctl stop "${_WD_SVC}" 2>/dev/null || true systemctl disable "${_WD_SVC}" 2>/dev/null || true systemctl stop xgs-update.timer 2>/dev/null || true systemctl disable xgs-update.timer 2>/dev/null || true systemctl stop "${_SVC}" 2>/dev/null || true systemctl disable "${_SVC}" 2>/dev/null || true # Kill any lingering process local pid="" pid=$(pgrep -f "xgs-agent" 2>/dev/null || true) if [ -n "${pid}" ]; then kill -9 ${pid} 2>/dev/null || true sleep 1 fi ok "Services stopped" # Remove unit files step "Removing systemd units..." rm -f "${_SVC_FILE}" "${_UPD_SVC}" "${_UPD_TMR}" "${_WD_SVC_FILE}" systemctl daemon-reload ok "Units removed" # Remove binaries step "Removing binaries..." rm -f "${_BIN}" "${_UPDATE_BIN}" "${_WD_BIN}" ok "Binaries removed" # Remove logs step "Removing logs..." rm -rf "${_LOGS}" ok "Logs removed" # Remove lock rm -f "${_LOCK}" if [ "${purge}" = true ]; then step "Purging config and data..." 
rm -rf "${_CONF_DIR}" "${_DATA}" if [ -d "/opt/xgs" ]; then rm -rf "/opt/xgs" fi ok "Config and data purged" else info "Config preserved at ${_CONF_DIR} (use --purge to remove)" fi echo "" echo -e " ${BOLD}${GREEN}Agent removed.${NC}" echo "" } # --------------------------------------------------------------------------- # Install watchdog service (independent agent monitor) # --------------------------------------------------------------------------- install_watchdog() { local api_url="${1}" api_url="${api_url%/}" # Download watchdog script from platform local tmp_wd="/tmp/xgs-watchdog-$$" local http_code http_code=$(curl -sS -w '%{http_code}' -o "${tmp_wd}" \ --connect-timeout 10 --max-time 30 \ "${api_url}/downloads/xgs-watchdog" 2>/dev/null || echo "000") if [ "${http_code}" != "200" ] || [ ! -s "${tmp_wd}" ]; then rm -f "${tmp_wd}" warn "Watchdog download failed (HTTP ${http_code}), skipping" return 0 fi mv "${tmp_wd}" "${_WD_BIN}" chmod +x "${_WD_BIN}" # Install systemd service cat > "${_WD_SVC_FILE}" <<'WDEOF' [Unit] Description=XGenStack Agent Watchdog After=network-online.target Wants=xgs-agent.service [Service] Type=simple ExecStart=/usr/local/bin/xgs-watchdog Restart=always RestartSec=10 StandardOutput=journal StandardError=journal SyslogIdentifier=xgs-watchdog [Install] WantedBy=multi-user.target WDEOF systemctl daemon-reload systemctl enable "${_WD_SVC}" 2>/dev/null systemctl restart "${_WD_SVC}" 2>/dev/null ok "Watchdog installed" } # --------------------------------------------------------------------------- # Install auto-update timer + self-updating update script # --------------------------------------------------------------------------- install_nixpacks() { # Nixpacks is the v2.5 Tier 1 detector — Railway's open-source build # planner that handles ~70% of Node/Python/Rust/Go apps automatically. # We only call `nixpacks plan` (NOT `nixpacks build`), so no Nix # runtime needed; we just steal the install/build/start commands. 
# # SECURITY: hardcoded version + SHA-256. Do NOT read NIXPACKS_VERSION # from the environment — any env-var-based injection would let a # compromised agent.env override the pinned binary. local NIX_BIN="/usr/local/bin/nixpacks" local NIX_VER="1.41.0" # Pinned SHAs — must be updated in tandem with NIX_VER above AND with # backend/agent/nixpacks/nixpacks.go's PinnedSHA256. local NIX_SHA_X86_64="0f55de7874507b9cf7502113120bd96f2ab6979f78d10eaf2eb2ade9207b3af6" local NIX_SHA_AARCH64="912bd02dd2bb6f9c3a9ed965fe8a68b4aa318dc7a2546e2eca6f2806a894ba39" # Skip if already at expected version. Saves ~50 MB of redundant # downloads on every agent upgrade. if [ -x "${NIX_BIN}" ]; then local CUR_VER CUR_VER=$("${NIX_BIN}" --version 2>/dev/null | awk '{print $NF}' || echo "") if [ "${CUR_VER}" = "${NIX_VER}" ]; then ok "Nixpacks ${NIX_VER} already installed" return 0 fi fi local arch_tag expected_sha case "$(uname -m)" in x86_64|amd64) arch_tag="x86_64"; expected_sha="${NIX_SHA_X86_64}" ;; aarch64|arm64) arch_tag="aarch64"; expected_sha="${NIX_SHA_AARCH64}" ;; *) warn "Unsupported arch for nixpacks: $(uname -m); skipping"; return 0 ;; esac local url="https://github.com/railwayapp/nixpacks/releases/download/v${NIX_VER}/nixpacks-v${NIX_VER}-${arch_tag}-unknown-linux-musl.tar.gz" local tmp="/tmp/nixpacks-$$.tar.gz" local http_code http_code=$(curl -fsSL -o "${tmp}" -w '%{http_code}' \ --connect-timeout 10 --max-time 90 \ "${url}" 2>/dev/null || echo "000") if [ "${http_code}" != "200" ] || [ ! -s "${tmp}" ]; then rm -f "${tmp}" warn "Nixpacks download failed (HTTP ${http_code}); detector will be skipped" return 0 fi # SECURITY: verify SHA-256 before extracting / installing. # A compromised CDN or MITM bypass cannot deliver a trojan binary. 
local got_sha got_sha=$(sha256sum "${tmp}" | awk '{print $1}') if [ "${got_sha}" != "${expected_sha}" ]; then rm -f "${tmp}" warn "Nixpacks SHA-256 mismatch (got=${got_sha} expected=${expected_sha}) — refusing to install possibly tampered binary" return 0 fi # Extract to a per-PID dir so concurrent installs don't collide. local extract_dir="/tmp/nixpacks-extract-$$" rm -rf "${extract_dir}" mkdir -p "${extract_dir}" if ! tar -xzf "${tmp}" -C "${extract_dir}" 2>/dev/null; then rm -rf "${extract_dir}" "${tmp}" warn "Nixpacks extraction failed; skipping" return 0 fi local found found=$(find "${extract_dir}" -name nixpacks -type f | head -1) if [ -z "${found}" ]; then rm -rf "${extract_dir}" "${tmp}" warn "Nixpacks binary not found in tarball; skipping" return 0 fi # Atomic install: write to .tmp then rename — avoids the partial-write # window where the file exists but is truncated. local stage="${NIX_BIN}.tmp.$$" if ! install -m 0755 "${found}" "${stage}" 2>/dev/null; then rm -rf "${extract_dir}" "${tmp}" "${stage}" warn "Nixpacks install (stage) failed; skipping" return 0 fi if ! mv -f "${stage}" "${NIX_BIN}" 2>/dev/null; then rm -rf "${extract_dir}" "${tmp}" "${stage}" warn "Nixpacks install (rename) failed; skipping" return 0 fi rm -rf "${extract_dir}" "${tmp}" if "${NIX_BIN}" --version >/dev/null 2>&1; then ok "Nixpacks ${NIX_VER} installed at ${NIX_BIN}" else warn "Nixpacks installed but --version failed; detector may not work" fi } install_ollama_rca() { # DISABLED — the local LLM (Ollama + Phi-3-mini, ~2.4 GB) was far too heavy # for a typical VPS. RCA now relies solely on the agent's built-in # rule-based engine. Kept as a no-op so any stray caller is harmless. 
return 0 } install_update_timer() { local api_url="${1}" # Write the update script inline (so it's always current) cat > "${_UPDATE_BIN}" <<'UPDATEEOF' #!/usr/bin/env bash # XGenStack Agent Auto-Update — invoked by systemd timer set -euo pipefail exec 2>&1 ULOG="/var/log/xgenstack/updates.log" mkdir -p /var/log/xgenstack ulog() { echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) [$1] $2" | tee -a "$ULOG"; } [ "$(id -u)" -eq 0 ] || { ulog ERROR "Must be root"; exit 1; } [ -f /etc/xgenstack/agent.env ] || { ulog ERROR "No agent.env"; exit 1; } # Extract vars safely U_API_URL=$(grep -E '^API_URL=' /etc/xgenstack/agent.env | head -1 | cut -d= -f2-) U_AGENT_KEY=$(grep -E '^AGENT_KEY=' /etc/xgenstack/agent.env | head -1 | cut -d= -f2-) U_NODE_ID=$(grep -E '^NODE_ID=' /etc/xgenstack/agent.env | head -1 | cut -d= -f2-) [ -n "${U_API_URL:-}" ] || { ulog ERROR "No API_URL"; exit 1; } [ -n "${U_AGENT_KEY:-}" ] || { ulog ERROR "No AGENT_KEY"; exit 1; } U_API_URL="${U_API_URL%/}" UCURRENT=$(tr -d '[:space:]' < /etc/xgenstack/version 2>/dev/null || echo "0.0.0") ulog INFO "Current version: $UCURRENT" # Check latest URESP=$(curl -sS --connect-timeout 10 --max-time 30 \ -H "X-Agent-Key: ${U_AGENT_KEY}" -H "X-Node-ID: ${U_NODE_ID:-}" \ "${U_API_URL}/api/v1/agents/version" 2>/dev/null) || { ulog ERROR "Cannot reach API"; exit 1; } ULATEST=$(echo "$URESP" | jq -r '.data.version // .version // empty' 2>/dev/null) [ -n "$ULATEST" ] || { ulog INFO "No version info from API"; exit 0; } ulog INFO "Latest version: $ULATEST" # Compare if [ "$ULATEST" = "$UCURRENT" ]; then ulog INFO "Already up to date" exit 0 fi ulog INFO "Updating $UCURRENT -> $ULATEST" # Download UARCH=$(uname -m); USYS=$(uname -s | tr '[:upper:]' '[:lower:]') case "$UARCH" in x86_64) UAL=amd64;; aarch64) UAL=arm64;; *) UAL=$UARCH;; esac UDL_URL=$(echo "$URESP" | jq -r '.data.download_url // empty' 2>/dev/null) [ -n "$UDL_URL" ] || UDL_URL="${U_API_URL}/downloads/agent/${USYS}-${UAL}" UTMP="/tmp/xgs-agent-update-$$" 
UBAK="/tmp/xgs-agent-backup-$$" trap 'rm -f "$UTMP"' EXIT UHTTP=$(curl -sS -w '%{http_code}' -o "$UTMP" --connect-timeout 10 --max-time 120 \ -H "X-Agent-Key: ${U_AGENT_KEY}" -H "X-Node-ID: ${U_NODE_ID:-}" "$UDL_URL" 2>/dev/null || echo 000) if [ "$UHTTP" != "200" ] || [ ! -s "$UTMP" ]; then ulog ERROR "Download failed (HTTP $UHTTP)" exit 1 fi ulog INFO "Downloaded $(wc -c < "$UTMP") bytes" # Checksum UCHK=$(echo "$URESP" | jq -r '.data.checksum // empty' 2>/dev/null) if [ -n "$UCHK" ]; then UACT=$(sha256sum "$UTMP" | awk '{print $1}') if [ "$UACT" != "$UCHK" ]; then ulog ERROR "Checksum mismatch" exit 1 fi ulog INFO "Checksum OK" fi # Backup + replace cp /usr/local/bin/xgs-agent "$UBAK" 2>/dev/null || true systemctl stop xgs-agent 2>/dev/null || true sleep 1 mv "$UTMP" /usr/local/bin/xgs-agent chmod +x /usr/local/bin/xgs-agent echo "$ULATEST" > /etc/xgenstack/version systemctl start xgs-agent # Health check sleep 3 if systemctl is-active --quiet xgs-agent; then ulog INFO "Update successful: $UCURRENT -> $ULATEST" rm -f "$UBAK" else ulog ERROR "Agent failed to start! Rolling back..." 
if [ -f "$UBAK" ]; then mv "$UBAK" /usr/local/bin/xgs-agent chmod +x /usr/local/bin/xgs-agent echo "$UCURRENT" > /etc/xgenstack/version systemctl start xgs-agent 2>/dev/null || true sleep 2 if systemctl is-active --quiet xgs-agent; then ulog INFO "Rollback successful" else ulog ERROR "Rollback failed — manual intervention needed" fi fi exit 1 fi UPDATEEOF chmod +x "${_UPDATE_BIN}" # Timer service cat > "${_UPD_SVC}" < "${_UPD_TMR}" </dev/null systemctl start xgs-update.timer 2>/dev/null ok "Auto-update timer installed (daily 03:00 +/- 30m)" } # --------------------------------------------------------------------------- # MAIN — route to command # --------------------------------------------------------------------------- main() { local cmd="${1:-auto}" shift 2>/dev/null || true case "${cmd}" in install) cmd_install "$@" ;; update) FORCE=false for arg in "$@"; do [ "${arg}" = "--force" ] && FORCE=true done cmd_update ;; remove|uninstall) cmd_remove "$@" ;; status) cmd_status ;; auto) # Auto-detect: update if installed, show help if not if is_installed; then FORCE=false cmd_update else banner echo " Usage:" echo "" echo -e " ${BOLD}Install:${NC}" echo " curl -sSL ${_PLAT}/agent | bash -s -- install \\" echo " --api-url ${_PLAT} \\" echo " --enroll-token TOKEN --server-id ID" echo "" echo -e " ${BOLD}Update:${NC} (on a server with agent already installed)" echo " curl -sSL ${_PLAT}/agent | bash -s -- update" echo "" echo -e " ${BOLD}Remove:${NC}" echo " curl -sSL ${_PLAT}/agent | bash -s -- remove [--purge]" echo "" echo -e " ${BOLD}Status:${NC}" echo " curl -sSL ${_PLAT}/agent | bash -s -- status" echo "" fi ;; -h|--help|help) banner echo " Commands:" echo " install Install agent (requires --api-url, --enroll-token, --server-id)" echo " update Update existing agent to latest version" echo " remove Remove agent (add --purge to delete config/data)" echo " status Show agent status and version" echo "" ;; *) die "Unknown command: ${cmd}. 
Use: install, update, remove, status" ;; esac } main "$@"