Building your own Operator
The OpenAI `computer-use-preview` model allows you to create AI agents that can control web browsers programmatically. This guide shows how to build an operator that can interact with web interfaces using natural language commands.
- Python
from playwright.sync_api import sync_playwright
from openai import OpenAI
import time
import base64
# API client shared by get_response() and computer_use_loop().
client = OpenAI()
# Start the synchronous Playwright driver; `p` must stay alive for as long as
# the browser is in use.
p = sync_playwright().start()
# Launch Chromium with a visible window (headless=False) and its sandbox enabled.
browser = p.chromium.launch(headless=False, chromium_sandbox=True)
page = browser.new_page()
# Keep the viewport equal to the display_width/display_height (1024x768) sent
# in the computer_use_preview tool configuration below, so the coordinates in
# the model's actions line up with the screenshots it receives.
page.set_viewport_size({"width": 1024, "height": 768})
# Starting page for the session.
page.goto("https://x.com")
# Lower-cased key names the model may emit, mapped to Playwright key names.
# Keys that are not listed are passed to Playwright unchanged.
_PLAYWRIGHT_KEYS = {
    "alt": "Alt",
    "arrowdown": "ArrowDown",
    "arrowleft": "ArrowLeft",
    "arrowright": "ArrowRight",
    "arrowup": "ArrowUp",
    "backspace": "Backspace",
    "capslock": "CapsLock",
    "cmd": "Meta",
    "control": "Control",
    "ctrl": "Control",
    "delete": "Delete",
    "end": "End",
    "enter": "Enter",
    "esc": "Escape",
    "escape": "Escape",
    "home": "Home",
    "insert": "Insert",
    "meta": "Meta",
    "option": "Alt",
    "pagedown": "PageDown",
    "pageup": "PageUp",
    "return": "Enter",
    "shift": "Shift",
    "space": " ",
    "super": "Meta",
    "tab": "Tab",
    "win": "Meta",
}


def handle_model_action(page, action):
    """Execute one computer-use action on a Playwright page.

    Args:
        page: Object exposing ``mouse``, ``keyboard`` and ``evaluate`` the way
            a Playwright sync ``Page`` does.
        action: Action object with a ``type`` attribute plus the fields that
            belong to that type (``x``/``y``, ``button``, ``scroll_x``/
            ``scroll_y``, ``keys``, ``text``, ``path``).

    Any exception raised while performing the action is caught and printed,
    so a single failed action does not abort the surrounding loop.
    Unrecognized action types are printed and otherwise ignored.
    """
    action_type = action.type
    try:
        if action_type == "click":
            # Only left/right are forwarded; any other button falls back to left.
            button = action.button if action.button in ("left", "right") else "left"
            page.mouse.click(action.x, action.y, button=button)
        elif action_type == "double_click":
            page.mouse.dblclick(action.x, action.y)
        elif action_type == "move":
            page.mouse.move(action.x, action.y)
        elif action_type == "drag":
            # Press at the first point, trace the rest of the path, release.
            points = list(action.path or [])
            if points:
                page.mouse.move(points[0].x, points[0].y)
                page.mouse.down()
                for point in points[1:]:
                    page.mouse.move(point.x, point.y)
                page.mouse.up()
        elif action_type == "scroll":
            page.mouse.move(action.x, action.y)
            page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
        elif action_type == "keypress":
            # `keys` is a combination (e.g. CTRL + A): hold every key down,
            # then release in reverse order, instead of pressing one by one.
            keys = [_PLAYWRIGHT_KEYS.get(k.lower(), k) for k in action.keys]
            held = []
            try:
                for key in keys:
                    page.keyboard.down(key)
                    held.append(key)
            finally:
                # Release whatever was pressed even if a later key failed, so
                # no modifier is left stuck down.
                for key in reversed(held):
                    page.keyboard.up(key)
        elif action_type == "type":
            page.keyboard.type(action.text)
        elif action_type == "wait":
            time.sleep(2)
        elif action_type == "screenshot":
            # Nothing to do: the caller takes a screenshot after every action.
            pass
        else:
            print(f"Unrecognized action: {action}")
    except Exception as e:
        # Deliberate best effort: report the failure and let the caller continue.
        print(f"Error handling action {action}: {e}")
def get_response(previous_response_id=None, human_input=None):
    """Request the next model response for the browsing session.

    Args:
        previous_response_id: Id of the response to continue from. When it is
            falsy, a new conversation is started with the built-in task prompt.
        human_input: The user's reply, sent as the user message when
            continuing a conversation.

    Returns:
        Whatever ``client.responses.create`` returns for the request.
    """
    computer_tool = {
        "type": "computer_use_preview",
        "display_width": 1024,
        "display_height": 768,
        "environment": "browser",
    }

    # First turn: no prior response to chain to, so send the fixed task prompt.
    if not previous_response_id:
        opening_message = {
            "role": "user",
            "content": "Goto to x.com, ask the user to login, then summarize the latest tweets",
        }
        return client.responses.create(
            model="computer-use-preview",
            tools=[computer_tool],
            truncation="auto",
            input=[opening_message],
        )

    # Follow-up turn: chain to the previous response and forward the user's reply.
    follow_up_message = {"role": "user", "content": human_input}
    return client.responses.create(
        model="computer-use-preview",
        tools=[computer_tool],
        truncation="auto",
        previous_response_id=previous_response_id,
        reasoning={"generate_summary": "concise"},
        input=[follow_up_message],
    )
def computer_use_loop(page, response):
    """Run the model's computer actions until it stops requesting them.

    Each iteration takes the first ``computer_call`` item in
    ``response.output``, performs its action on ``page``, waits one second,
    and sends a PNG screenshot of the page back as a ``computer_call_output``
    chained to the previous response.

    If the call carries pending safety checks, they are printed and the user
    is asked to confirm before the action is executed. Confirmed checks are
    echoed back as ``acknowledged_safety_checks``.

    Args:
        page: Playwright page the actions are performed on.
        response: Response object whose ``output`` items are inspected.

    Returns:
        The first response that contains no ``computer_call`` item.

    Raises:
        RuntimeError: If the user declines to acknowledge pending safety checks.
    """
    # Loop-invariant tool configuration, built once.
    computer_tool = {
        "type": "computer_use_preview",
        "display_width": 1024,
        "display_height": 768,
        "environment": "browser",
    }

    while True:
        computer_calls = [item for item in response.output if item.type == "computer_call"]
        if not computer_calls:
            return response
        call = computer_calls[0]

        # Ask for confirmation BEFORE acting: the checks describe a risk tied
        # to the action the model is about to take.
        pending_checks = getattr(call, "pending_safety_checks", None) or []
        acknowledged = []
        if pending_checks:
            for check in pending_checks:
                print(f"Safety check [{check.code}]: {check.message}")
            answer = input("Acknowledge safety checks and continue? [y/N]: ")
            if answer.strip().lower() not in ("y", "yes"):
                raise RuntimeError("Pending safety checks were not acknowledged.")
            acknowledged = [
                {"id": check.id, "code": check.code, "message": check.message}
                for check in pending_checks
            ]

        handle_model_action(page, call.action)
        # Give the page a moment to react before capturing it.
        time.sleep(1)
        screenshot_base64 = base64.b64encode(page.screenshot()).decode("utf-8")

        call_output = {
            "call_id": call.call_id,
            "type": "computer_call_output",
            "output": {
                "type": "input_image",
                "image_url": f"data:image/png;base64,{screenshot_base64}",
            },
        }
        if acknowledged:
            call_output["acknowledged_safety_checks"] = acknowledged

        response = client.responses.create(
            model="computer-use-preview",
            previous_response_id=response.id,
            tools=[computer_tool],
            input=[call_output],
            truncation="auto",
        )
# Conversation driver: alternate between letting the model act in the browser
# and asking the user for the next instruction, until an error occurs.
try:
    response = get_response()
    while True:
        try:
            response = computer_use_loop(page, response)
            # A response may contain no message item; stop with an explicit
            # note instead of an empty StopIteration message.
            message = next(
                (item for item in response.output if item.type == "message"),
                None,
            )
            if message is None:
                print("No assistant message in the response; stopping.")
                break
            print("\n" + "="*50)
            print("🤖 Assistant: " + message.content[0].text)
            print("-"*50)
            human_input = input("👤 Your response: ")
            response = get_response(response.id, human_input)
        except Exception as e:
            # Top-level boundary: report the error and end the session.
            print(e)
            break
finally:
    # Always release the browser and the Playwright driver, including on
    # Ctrl+C, so no Chromium process is left behind.
    browser.close()
    p.stop()