From f377db10156473fd39b83c994686e7b40a029c5c Mon Sep 17 00:00:00 2001 From: Val Alexander <68980965+BunsDev@users.noreply.github.com> Date: Fri, 17 Apr 2026 02:58:21 -0500 Subject: [PATCH] feat: add macOS screen snapshots for monitor preview (#67954) thanks @BunsDev Co-authored-by: Val Alexander <68980965+BunsDev@users.noreply.github.com> --- CHANGELOG.md | 4 + .../NodeMode/MacNodeModeCoordinator.swift | 1 + .../OpenClaw/NodeMode/MacNodeRuntime.swift | 30 +++++ .../MacNodeRuntimeMainActorServices.swift | 22 ++++ .../NodeMode/MacNodeScreenCommands.swift | 9 ++ .../OpenClaw/ScreenSnapshotService.swift | 109 ++++++++++++++++++ .../MacNodeRuntimeTests.swift | 101 ++++++++++++++++ .../Sources/OpenClawKit/ScreenCommands.swift | 25 ++++ docs/platforms/macos.md | 2 +- src/gateway/gateway-misc.test.ts | 15 +++ src/gateway/node-command-policy.ts | 2 + src/wizard/setup.gateway-config.test.ts | 2 + 12 files changed, 321 insertions(+), 1 deletion(-) create mode 100644 apps/macos/Sources/OpenClaw/ScreenSnapshotService.swift diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b9f1213ec7..4ff27d9f538 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ Docs: https://docs.openclaw.ai ## Unreleased +### Changes + +- macOS/gateway: add `screen.snapshot` support for macOS app nodes, including runtime plumbing, default macOS allowlisting, and docs for monitor preview flows. (#67954) Thanks @BunsDev. + ### Fixes - Onboarding/non-interactive: preserve existing gateway auth tokens during re-onboard so active local gateway clients are not disconnected by an implicit token rotation. (#67821) Thanks @BKF-Gitty. diff --git a/apps/macos/Sources/OpenClaw/NodeMode/MacNodeModeCoordinator.swift b/apps/macos/Sources/OpenClaw/NodeMode/MacNodeModeCoordinator.swift index 5e093c49e24..9ae03784a42 100644 --- a/apps/macos/Sources/OpenClaw/NodeMode/MacNodeModeCoordinator.swift +++ b/apps/macos/Sources/OpenClaw/NodeMode/MacNodeModeCoordinator.swift @@ -146,6 +146,7 @@ final class MacNodeModeCoordinator { OpenClawCanvasA2UICommand.push.rawValue, OpenClawCanvasA2UICommand.pushJSONL.rawValue, OpenClawCanvasA2UICommand.reset.rawValue, + MacNodeScreenCommand.snapshot.rawValue, MacNodeScreenCommand.record.rawValue, OpenClawSystemCommand.notify.rawValue, OpenClawSystemCommand.which.rawValue, diff --git a/apps/macos/Sources/OpenClaw/NodeMode/MacNodeRuntime.swift b/apps/macos/Sources/OpenClaw/NodeMode/MacNodeRuntime.swift index 956abf94ad6..350a6897f8f 100644 --- a/apps/macos/Sources/OpenClaw/NodeMode/MacNodeRuntime.swift +++ b/apps/macos/Sources/OpenClaw/NodeMode/MacNodeRuntime.swift @@ -63,6 +63,8 @@ actor MacNodeRuntime { return try await self.handleCameraInvoke(req) case OpenClawLocationCommand.get.rawValue: return try await self.handleLocationInvoke(req) + case MacNodeScreenCommand.snapshot.rawValue: + return try await self.handleScreenSnapshotInvoke(req) case MacNodeScreenCommand.record.rawValue: return try await self.handleScreenRecordInvoke(req) case OpenClawSystemCommand.run.rawValue: @@ -352,6 +354,34 @@ actor MacNodeRuntime { return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) } + private func handleScreenSnapshotInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse { + let params = (try? Self.decodeParams(MacNodeScreenSnapshotParams.self, from: req.paramsJSON)) ?? + MacNodeScreenSnapshotParams() + let services = await self.mainActorServices() + let capturedAtMs = Int64(Date().timeIntervalSince1970 * 1000) + let res = try await services.snapshotScreen( + screenIndex: params.screenIndex, + maxWidth: params.maxWidth, + quality: params.quality, + format: params.format) + struct ScreenSnapshotPayload: Encodable { + var format: String + var base64: String + var width: Int + var height: Int + var screenIndex: Int? + var capturedAtMs: Int64 + } + let payload = try Self.encodePayload(ScreenSnapshotPayload( + format: res.format.rawValue, + base64: res.data.base64EncodedString(), + width: res.width, + height: res.height, + screenIndex: params.screenIndex, + capturedAtMs: capturedAtMs)) + return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) + } + private func mainActorServices() async -> any MacNodeRuntimeMainActorServices { if let cachedMainActorServices { return cachedMainActorServices } let services = await self.makeMainActorServices() diff --git a/apps/macos/Sources/OpenClaw/NodeMode/MacNodeRuntimeMainActorServices.swift b/apps/macos/Sources/OpenClaw/NodeMode/MacNodeRuntimeMainActorServices.swift index 733410b1860..2c2f8117c59 100644 --- a/apps/macos/Sources/OpenClaw/NodeMode/MacNodeRuntimeMainActorServices.swift +++ b/apps/macos/Sources/OpenClaw/NodeMode/MacNodeRuntimeMainActorServices.swift @@ -4,6 +4,13 @@ import OpenClawKit @MainActor protocol MacNodeRuntimeMainActorServices: Sendable { + func snapshotScreen( + screenIndex: Int?, + maxWidth: Int?, + quality: Double?, + format: OpenClawScreenSnapshotFormat?) async throws + -> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int) + func recordScreen( screenIndex: Int?, durationMs: Int?, @@ -21,9 +28,24 @@ protocol MacNodeRuntimeMainActorServices: Sendable { @MainActor final class LiveMacNodeRuntimeMainActorServices: MacNodeRuntimeMainActorServices, @unchecked Sendable { + private let screenSnapshotter = ScreenSnapshotService() private let screenRecorder = ScreenRecordService() private let locationService = MacNodeLocationService() + func snapshotScreen( + screenIndex: Int?, + maxWidth: Int?, + quality: Double?, + format: OpenClawScreenSnapshotFormat?) async throws + -> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int) + { + try await self.screenSnapshotter.snapshot( + screenIndex: screenIndex, + maxWidth: maxWidth, + quality: quality, + format: format) + } + func recordScreen( screenIndex: Int?, durationMs: Int?, diff --git a/apps/macos/Sources/OpenClaw/NodeMode/MacNodeScreenCommands.swift b/apps/macos/Sources/OpenClaw/NodeMode/MacNodeScreenCommands.swift index a61867c3c65..8f4cec46d36 100644 --- a/apps/macos/Sources/OpenClaw/NodeMode/MacNodeScreenCommands.swift +++ b/apps/macos/Sources/OpenClaw/NodeMode/MacNodeScreenCommands.swift @@ -1,9 +1,18 @@ import Foundation +import OpenClawKit enum MacNodeScreenCommand: String, Codable { + case snapshot = "screen.snapshot" case record = "screen.record" } +struct MacNodeScreenSnapshotParams: Codable, Equatable { + var screenIndex: Int? + var maxWidth: Int? + var quality: Double? + var format: OpenClawScreenSnapshotFormat? +} + struct MacNodeScreenRecordParams: Codable, Equatable { var screenIndex: Int? var durationMs: Int? diff --git a/apps/macos/Sources/OpenClaw/ScreenSnapshotService.swift b/apps/macos/Sources/OpenClaw/ScreenSnapshotService.swift new file mode 100644 index 00000000000..8a5b9f813e9 --- /dev/null +++ b/apps/macos/Sources/OpenClaw/ScreenSnapshotService.swift @@ -0,0 +1,109 @@ +import AppKit +import Foundation +import OpenClawKit +@preconcurrency import ScreenCaptureKit + +@MainActor +final class ScreenSnapshotService { + enum ScreenSnapshotError: LocalizedError { + case noDisplays + case invalidScreenIndex(Int) + case captureFailed(String) + case encodeFailed(String) + + var errorDescription: String? { + switch self { + case .noDisplays: + "No displays available for screen snapshot" + case let .invalidScreenIndex(idx): + "Invalid screen index \(idx)" + case let .captureFailed(message): + message + case let .encodeFailed(message): + message + } + } + } + + func snapshot( + screenIndex: Int?, + maxWidth: Int?, + quality: Double?, + format: OpenClawScreenSnapshotFormat?) async throws + -> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int) + { + let format = format ?? .jpeg + let normalized = Self.normalize(maxWidth: maxWidth, quality: quality, format: format) + + let content = try await SCShareableContent.current + let displays = content.displays.sorted { $0.displayID < $1.displayID } + guard !displays.isEmpty else { + throw ScreenSnapshotError.noDisplays + } + + let idx = screenIndex ?? 0 + guard idx >= 0, idx < displays.count else { + throw ScreenSnapshotError.invalidScreenIndex(idx) + } + let display = displays[idx] + + let filter = SCContentFilter(display: display, excludingWindows: []) + let config = SCStreamConfiguration() + let targetSize = Self.targetSize( + width: display.width, + height: display.height, + maxWidth: normalized.maxWidth) + config.width = targetSize.width + config.height = targetSize.height + config.showsCursor = true + + let cgImage: CGImage + do { + cgImage = try await SCScreenshotManager.captureImage( + contentFilter: filter, + configuration: config) + } catch { + throw ScreenSnapshotError.captureFailed(error.localizedDescription) + } + + let bitmap = NSBitmapImageRep(cgImage: cgImage) + let data: Data + switch format { + case .png: + guard let encoded = bitmap.representation(using: .png, properties: [:]) else { + throw ScreenSnapshotError.encodeFailed("png encode failed") + } + data = encoded + case .jpeg: + guard let encoded = bitmap.representation( + using: .jpeg, + properties: [.compressionFactor: normalized.quality]) + else { + throw ScreenSnapshotError.encodeFailed("jpeg encode failed") + } + data = encoded + } + + return (data: data, format: format, width: cgImage.width, height: cgImage.height) + } + + private static func normalize( + maxWidth: Int?, + quality: Double?, + format: OpenClawScreenSnapshotFormat) + -> (maxWidth: Int, quality: Double) + { + let resolvedMaxWidth = maxWidth.flatMap { $0 > 0 ? $0 : nil } ?? (format == .png ? 900 : 1600) + let resolvedQuality = min(1.0, max(0.05, quality ?? 0.72)) + return (maxWidth: resolvedMaxWidth, quality: resolvedQuality) + } + + private static func targetSize(width: Int, height: Int, maxWidth: Int) -> (width: Int, height: Int) { + guard width > 0, height > 0, width > maxWidth else { + return (width: width, height: height) + } + let scale = Double(maxWidth) / Double(width) + let targetHeight = max(1, Int((Double(height) * scale).rounded())) + return (width: maxWidth, height: targetHeight) + } +} diff --git a/apps/macos/Tests/OpenClawIPCTests/MacNodeRuntimeTests.swift b/apps/macos/Tests/OpenClawIPCTests/MacNodeRuntimeTests.swift index 38c4211f014..d6eae4d866f 100644 --- a/apps/macos/Tests/OpenClawIPCTests/MacNodeRuntimeTests.swift +++ b/apps/macos/Tests/OpenClawIPCTests/MacNodeRuntimeTests.swift @@ -78,6 +78,19 @@ struct MacNodeRuntimeTests { @Test func `handle invoke screen record uses injected services`() async throws { @MainActor final class FakeMainActorServices: MacNodeRuntimeMainActorServices, @unchecked Sendable { + func snapshotScreen( + screenIndex: Int?, + maxWidth: Int?, + quality: Double?, + format: OpenClawScreenSnapshotFormat?) async throws + -> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int) + { + _ = screenIndex + _ = maxWidth + _ = quality + return (Data("snapshot".utf8), format ?? .jpeg, 640, 360) + } + func recordScreen( screenIndex: Int?, durationMs: Int?, @@ -127,6 +140,94 @@ struct MacNodeRuntimeTests { #expect(!payload.base64.isEmpty) } + @Test func `handle invoke screen snapshot uses injected services`() async throws { + @MainActor + final class FakeMainActorServices: MacNodeRuntimeMainActorServices, @unchecked Sendable { + var snapshotCalledAtMs: Int64? + + func snapshotScreen( + screenIndex: Int?, + maxWidth: Int?, + quality: Double?, + format: OpenClawScreenSnapshotFormat?) async throws + -> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int) + { + self.snapshotCalledAtMs = Int64(Date().timeIntervalSince1970 * 1000) + #expect(screenIndex == 0) + #expect(maxWidth == 800) + #expect(quality == 0.5) + return (Data("ok".utf8), format ?? .jpeg, 800, 450) + } + + func recordScreen( + screenIndex: Int?, + durationMs: Int?, + fps: Double?, + includeAudio: Bool?, + outPath: String?) async throws -> (path: String, hasAudio: Bool) + { + let url = FileManager().temporaryDirectory + .appendingPathComponent("openclaw-test-screen-record-\(UUID().uuidString).mp4") + try Data("ok".utf8).write(to: url) + return (path: url.path, hasAudio: false) + } + + func locationAuthorizationStatus() -> CLAuthorizationStatus { + .authorizedAlways + } + + func locationAccuracyAuthorization() -> CLAccuracyAuthorization { + .fullAccuracy + } + + func currentLocation( + desiredAccuracy: OpenClawLocationAccuracy, + maxAgeMs: Int?, + timeoutMs: Int?) async throws -> CLLocation + { + _ = desiredAccuracy + _ = maxAgeMs + _ = timeoutMs + return CLLocation(latitude: 0, longitude: 0) + } + } + + let services = await MainActor.run { FakeMainActorServices() } + let runtime = MacNodeRuntime(makeMainActorServices: { services }) + + let params = MacNodeScreenSnapshotParams( + screenIndex: 0, + maxWidth: 800, + quality: 0.5, + format: .jpeg) + let json = try String(data: JSONEncoder().encode(params), encoding: .utf8) + let response = await runtime.handleInvoke( + BridgeInvokeRequest( + id: "req-screen-snapshot", + command: MacNodeScreenCommand.snapshot.rawValue, + paramsJSON: json)) + #expect(response.ok == true) + let payloadJSON = try #require(response.payloadJSON) + + struct Payload: Decodable { + var format: String + var base64: String + var width: Int + var height: Int + var capturedAtMs: Int64 + } + + let payload = try JSONDecoder().decode(Payload.self, from: Data(payloadJSON.utf8)) + #expect(payload.format == "jpeg") + #expect(payload.base64 == Data("ok".utf8).base64EncodedString()) + #expect(payload.width == 800) + #expect(payload.height == 450) + #expect(payload.capturedAtMs > 0) + let snapshotCalledAtMs = await MainActor.run { services.snapshotCalledAtMs } + #expect(snapshotCalledAtMs != nil) + #expect(payload.capturedAtMs <= snapshotCalledAtMs!) + } + @Test func `handle invoke browser proxy uses injected request`() async { let runtime = MacNodeRuntime(browserProxyRequest: { paramsJSON in #expect(paramsJSON?.contains("/tabs") == true) diff --git a/apps/shared/OpenClawKit/Sources/OpenClawKit/ScreenCommands.swift b/apps/shared/OpenClawKit/Sources/OpenClawKit/ScreenCommands.swift index dfb57ce2ab2..bfa3e79d5c7 100644 --- a/apps/shared/OpenClawKit/Sources/OpenClawKit/ScreenCommands.swift +++ b/apps/shared/OpenClawKit/Sources/OpenClawKit/ScreenCommands.swift @@ -1,9 +1,34 @@ import Foundation public enum OpenClawScreenCommand: String, Codable, Sendable { + case snapshot = "screen.snapshot" case record = "screen.record" } +public enum OpenClawScreenSnapshotFormat: String, Codable, Sendable { + case jpeg + case png +} + +public struct OpenClawScreenSnapshotParams: Codable, Sendable, Equatable { + public var screenIndex: Int? + public var maxWidth: Int? + public var quality: Double? + public var format: OpenClawScreenSnapshotFormat? + + public init( + screenIndex: Int? = nil, + maxWidth: Int? = nil, + quality: Double? = nil, + format: OpenClawScreenSnapshotFormat? = nil) + { + self.screenIndex = screenIndex + self.maxWidth = maxWidth + self.quality = quality + self.format = format + } +} + public struct OpenClawScreenRecordParams: Codable, Sendable, Equatable { public var screenIndex: Int? public var durationMs: Int? diff --git a/docs/platforms/macos.md b/docs/platforms/macos.md index d3ae432bbae..14971b36117 100644 --- a/docs/platforms/macos.md +++ b/docs/platforms/macos.md @@ -55,7 +55,7 @@ The macOS app presents itself as a node. Common commands: - Canvas: `canvas.present`, `canvas.navigate`, `canvas.eval`, `canvas.snapshot`, `canvas.a2ui.*` - Camera: `camera.snap`, `camera.clip` -- Screen: `screen.record` +- Screen: `screen.snapshot`, `screen.record` - System: `system.run`, `system.notify` The node reports a `permissions` map so agents can decide what’s allowed. diff --git a/src/gateway/gateway-misc.test.ts b/src/gateway/gateway-misc.test.ts index 1b10906790f..ad8f480fb93 100644 --- a/src/gateway/gateway-misc.test.ts +++ b/src/gateway/gateway-misc.test.ts @@ -367,6 +367,21 @@ describe("resolveNodeCommandAllowlist", () => { expect(DEFAULT_DANGEROUS_NODE_COMMANDS).toContain("sms.search"); }); + it("allows macOS screen.snapshot by default but keeps screen.record gated", () => { + const allow = resolveNodeCommandAllowlist( + {}, + { + platform: "macOS 26.3.1", + deviceFamily: "Mac", + }, + ); + + expect(DEFAULT_DANGEROUS_NODE_COMMANDS).not.toContain("screen.snapshot"); + expect(DEFAULT_DANGEROUS_NODE_COMMANDS).toContain("screen.record"); + expect(allow.has("screen.snapshot")).toBe(true); + expect(allow.has("screen.record")).toBe(false); + }); + it("can explicitly allow dangerous commands via allowCommands", () => { const allow = resolveNodeCommandAllowlist( { diff --git a/src/gateway/node-command-policy.ts b/src/gateway/node-command-policy.ts index 6712327c6a7..96a20a940c1 100644 --- a/src/gateway/node-command-policy.ts +++ b/src/gateway/node-command-policy.ts @@ -21,6 +21,7 @@ const CANVAS_COMMANDS = [ const CAMERA_COMMANDS = ["camera.list"]; const CAMERA_DANGEROUS_COMMANDS = ["camera.snap", "camera.clip"]; +const SCREEN_COMMANDS = ["screen.snapshot"]; const SCREEN_DANGEROUS_COMMANDS = ["screen.record"]; const LOCATION_COMMANDS = ["location.get"]; @@ -111,6 +112,7 @@ const PLATFORM_DEFAULTS: Record = { ...PHOTOS_COMMANDS, ...MOTION_COMMANDS, ...SYSTEM_COMMANDS, + ...SCREEN_COMMANDS, ], linux: [...SYSTEM_COMMANDS], windows: [...SYSTEM_COMMANDS], diff --git a/src/wizard/setup.gateway-config.test.ts b/src/wizard/setup.gateway-config.test.ts index 53f3a35db97..c85fb692802 100644 --- a/src/wizard/setup.gateway-config.test.ts +++ b/src/wizard/setup.gateway-config.test.ts @@ -95,6 +95,8 @@ describe("configureGatewayForSetup", () => { expect(result.settings.gatewayToken).toBe("generated-token"); expect(result.nextConfig.gateway?.nodes?.denyCommands).toEqual(DEFAULT_DANGEROUS_NODE_COMMANDS); + expect(result.nextConfig.gateway?.nodes?.denyCommands).not.toContain("screen.snapshot"); + expect(result.nextConfig.gateway?.nodes?.denyCommands).toContain("screen.record"); }); it("prefers OPENCLAW_GATEWAY_TOKEN during quickstart token setup", async () => {