From 2f28459f9ae8c782b220e86392cb257eaadb3c0a Mon Sep 17 00:00:00 2001 From: garrettmills Date: Tue, 22 Apr 2025 10:03:57 -0400 Subject: [PATCH] More work on early system setup init containers --- Cargo.lock | 44 +++++++++++++++---- Cargo.toml | 1 + deploy/30-service-account.yaml | 77 ++++++++++++++++++++++++++++++++++ deploy/40-statefulset.yaml | 23 ++++++++++ src/api/cluster/system.rs | 12 ++++-- src/api/db/migrations/mod.rs | 5 +++ src/api/db/mod.rs | 2 +- src/api/mod.rs | 2 +- src/main.rs | 18 +++++--- 9 files changed, 167 insertions(+), 17 deletions(-) create mode 100644 deploy/30-service-account.yaml diff --git a/Cargo.lock b/Cargo.lock index 1467b00..cd95afc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -908,12 +908,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "976dd42dc7e85965fe702eb8164f21f450704bdde31faefd6471dba214cb594e" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1171,6 +1171,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "gethostname" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed7131e57abbde63513e0e6636f76668a1ca9798dcae2df4e283cae9ee83859e" +dependencies = [ + "rustix 1.0.5", + "windows-targets 0.52.6", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -1850,9 +1860,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.161" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libm" @@ -1903,6 +1913,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + [[package]] name = "lock_api" version = "0.4.12" @@ -2275,6 +2291,7 @@ dependencies = [ "dotenv", "env_logger", "futures", + "gethostname", "k8s-openapi", "kube", "log", @@ -2985,10 +3002,23 @@ dependencies = [ "bitflags 2.6.0", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.4.14", "windows-sys 0.52.0", ] +[[package]] +name = "rustix" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys 0.9.4", + "windows-sys 0.59.0", +] + [[package]] name = "rustls" version = "0.23.25" @@ -3944,7 +3974,7 @@ dependencies = [ "cfg-if", "fastrand", "once_cell", - "rustix", + "rustix 0.38.38", "windows-sys 0.59.0", ] diff --git a/Cargo.toml b/Cargo.toml index 55efe82..d1456c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,3 +24,4 @@ ssh-key = { version = "0.6.7", features = ["ed25519"] } rand = "0.8.5" kube = { version = "0.99.0", features = ["runtime", "derive"] } k8s-openapi = { version = "0.24", features = ["v1_32"] } +gethostname = "1.0.1" diff --git a/deploy/30-service-account.yaml b/deploy/30-service-account.yaml new file mode 100644 index 0000000..e63c2b3 --- /dev/null +++ b/deploy/30-service-account.yaml @@ -0,0 +1,77 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: p5x-api-sa + namespace: p5x-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: p5x-api-configmap-role + namespace: p5x-system +rules: + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["dynamic-kv"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: p5x-api-configmap-binding + namespace: p5x-system +subjects: + - kind: ServiceAccount + name: p5x-api-sa + namespace: p5x-system +roleRef: + kind: Role + name: p5x-api-configmap-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: pod-self-read-role + namespace: p5x-system +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: pod-self-read-binding + namespace: p5x-system +subjects: + - kind: ServiceAccount + name: p5x-api-sa + namespace: p5x-system +roleRef: + kind: Role + name: pod-self-read-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-label-read-role +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-label-read-binding +subjects: + - kind: ServiceAccount + name: p5x-api-sa + namespace: p5x-system +roleRef: + kind: ClusterRole + name: node-label-read-role + apiGroup: rbac.authorization.k8s.io diff --git a/deploy/40-statefulset.yaml b/deploy/40-statefulset.yaml index b7512b1..ab98204 100644 --- a/deploy/40-statefulset.yaml +++ b/deploy/40-statefulset.yaml @@ -19,11 +19,18 @@ spec: app: p5x-api-server spec: priorityClassName: system-cluster-critical + + # api-server interacts w/ some parts of the K8s API, so bind its service account + serviceAccountName: p5x-api-sa + volumes: + # Used for the api-server SQLite database et al - name: p5x-system-data persistentVolumeClaim: claimName: system-data + initContainers: + # Make sure the p5x-system-data disk exists in PVE and is mounted - name: ensure-system-disk image: docker.io/glmdev/p5x-api:latest envFrom: @@ -37,6 +44,22 @@ spec: - name: RUST_LOG value: 'p5x=debug,sqlx=warn,info' command: ['/p5x/p5x', 'ensure-system-disk'] + + # Transfer the p5x-system-data PVE disk to the correct node, if necessary + - name: migrate-system-disk + image: docker.io/glmdev/p5x-api:latest + envFrom: + - secretRef: + name: api-env + env: + - name: P5X_NODE_HOSTNAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: RUST_LOG + value: 'p5x=debug,sqlx=warn,info' + command: ['/p5x/p5x', 'migrate-system-disk'] + containers: - name: api-server image: docker.io/glmdev/p5x-api:latest diff --git a/src/api/cluster/system.rs b/src/api/cluster/system.rs index 97cfe2c..8d8eb41 100644 --- a/src/api/cluster/system.rs +++ b/src/api/cluster/system.rs @@ -15,6 +15,7 @@ use proxmox_api::types::VmId; use proxmox_api::UreqError; use serde_json::json; use tokio::time::sleep; +use gethostname::gethostname; use crate::api::cluster::carrier::{provision_carrier_unmanaged, terminate_carrier_unmanaged}; use crate::api::cluster::node::migrate_node_unmanaged; use crate::api::cluster::volume::create_volume_unmanaged; @@ -46,7 +47,9 @@ pub async fn ensure_system_disk(svc: &Services<'_>) -> Result<(), P5xError> { info!(target: "p5x", "Provisioning new P5x API system disk (this is a one-time fixup)..."); // Load the labels for this pod's node - let pod_name = env::var("POD_NAME").expect("Could not determine POD_NAME from environment!"); + let pod_name = env::var("POD_NAME") + .or_else(|_| gethostname().into_string()) + .expect("Could not determine pod name from environment"); let pods: Api = Api::namespaced(client.clone(), &namespace); let pod = pods.get(&pod_name).await.map_err(P5xError::KubeError)?; @@ -72,7 +75,7 @@ pub async fn ensure_system_disk(svc: &Services<'_>) -> Result<(), P5xError> { 5 * 1024 * 1024 * 1024, pve_host, pve_id, - "p5x-api-system-disk" + "system-data" ).await?; // Add it to the dynamic-kv config and save @@ -96,6 +99,7 @@ pub async fn migrate_system_disk_if_necessary(svc: &Services<'_>) -> Result<(), // Load the dynamic-kv and get the current host/mount let client = Client::try_default().await.map_err(P5xError::KubeError)?; let namespace = fs::read_to_string("/var/run/secrets/kubernetes.io/serviceaccount/namespace") + .or_else(|_| env::var("P5X_OVERRIDE_NS")) .unwrap_or_else(|_| "p5x-system".to_string()); let maps: Api = Api::namespaced(client.clone(), &namespace); @@ -107,7 +111,9 @@ pub async fn migrate_system_disk_if_necessary(svc: &Services<'_>) -> Result<(), let current_pve_id: i32 = data.get("api-pve-id").expect("Could not find api-pve-id in dynamic-kv config").parse().unwrap(); // Load the labels for this pod's node - let pod_name = env::var("POD_NAME").expect("Could not determine POD_NAME from environment!"); + let pod_name = env::var("POD_NAME") + .or_else(|_| gethostname().into_string()) + .expect("Could not determine pod name from environment"); let pods: Api = Api::namespaced(client.clone(), &namespace); let pod = pods.get(&pod_name).await.map_err(P5xError::KubeError)?; diff --git a/src/api/db/migrations/mod.rs b/src/api/db/migrations/mod.rs index 34bbc2a..7a9f501 100644 --- a/src/api/db/migrations/mod.rs +++ b/src/api/db/migrations/mod.rs @@ -1,6 +1,7 @@ use async_trait::async_trait; use rocket::{fairing, Build, Rocket}; use rocket::fairing::AdHoc; +use sea_orm::DatabaseConnection; pub use sea_orm_migration::prelude::*; use sea_orm_rocket::Database; use crate::api::Db; @@ -30,6 +31,10 @@ async fn run_migrations(rocket: Rocket) -> fairing::Result { Ok(rocket) } +pub async fn run_migrations_unmanaged(conn: &DatabaseConnection) -> Result<(), DbErr> { + Migrator::up(conn, None).await +} + pub(super) fn init() -> AdHoc { AdHoc::try_on_ignite("Applying migrations", run_migrations) } diff --git a/src/api/db/mod.rs b/src/api/db/mod.rs index 4313363..b319b42 100644 --- a/src/api/db/mod.rs +++ b/src/api/db/mod.rs @@ -1,4 +1,4 @@ -mod migrations; +pub mod migrations; use std::time::Duration; use rocket::figment::Figment; diff --git a/src/api/mod.rs b/src/api/mod.rs index 9bb23a4..004c4e1 100644 --- a/src/api/mod.rs +++ b/src/api/mod.rs @@ -1,6 +1,6 @@ use rocket::fairing::AdHoc; -mod db; +pub mod db; mod route; pub mod util; pub mod cluster; diff --git a/src/main.rs b/src/main.rs index 0e45c77..b5327ea 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,12 +2,13 @@ pub mod api; #[macro_use] extern crate rocket; use dotenv::dotenv; use rocket::{Build, Rocket}; -use log::{error, info}; +use log::{error, info, debug}; use std::{env, process}; use sea_orm::Database; use crate::api::cluster::system::{ensure_ssh_keypair, ensure_system_disk, migrate_system_disk_if_necessary}; use crate::api::services::Services; use crate::api::util::read_p5x_config; +use crate::api::db::migrations::run_migrations_unmanaged; fn configure_rocket() -> Rocket { rocket::build() @@ -27,22 +28,29 @@ async fn main() { } let mode = &args[1]; + debug!(target: "p5x", "Running with mode: {mode}"); + + // Intentionally generate this before migrating/ensuring the system disk, since that requires + // Services, and Services requires the SSH keys to exist. The keys generated during the system + // disk ops will be overwritten when the disk is mounted. + ensure_ssh_keypair().expect("Could not ensure SSH keypair exists."); + if mode == "ensure-system-disk" { let anon_db = Database::connect("sqlite::memory:").await.unwrap(); - let svc = Services::build(&anon_db).await.unwrap(); // fixme: this is going to fail because of the SSH keys + run_migrations_unmanaged(&anon_db).await.unwrap(); + let svc = Services::build(&anon_db).await.unwrap(); ensure_system_disk(&svc).await.unwrap(); return; } if mode == "migrate-system-disk" { let anon_db = Database::connect("sqlite::memory:").await.unwrap(); - let svc = Services::build(&anon_db).await.unwrap(); // fixme: this is going to fail because of the SSH keys + run_migrations_unmanaged(&anon_db).await.unwrap(); + let svc = Services::build(&anon_db).await.unwrap(); migrate_system_disk_if_necessary(&svc).await.unwrap(); return; } - ensure_ssh_keypair().expect("Could not ensure SSH keypair exists."); - let config = read_p5x_config(); // Do this so we early-fail if there are missing env vars info!(target: "p5x", "Successfully read config from environment."); info!(target: "p5x", "Cluster host: {} ({})", config.pve_host_name, config.pve_api_host);