120628a01c
This fixes the data race uncovered by the recent refactor to run all tests as tabs under the same browser. The problem was that a write on the pages map could be done from the goroutine calling NewContext to create a new map, while other goroutines could similarly read or write the same map. Instead of adding a lock around the map, make one of the Browser's goroutines be the sole user of the map. To make that extra obvious and avoid potential races in the future, declare the map inside the goroutine's scope. For some reason, this makes the Attributes tests flakier than before. For now, add short sleeps; we can investigate that separately, now that the data races are gone.
339 lines
7.9 KiB
Go
339 lines
7.9 KiB
Go
// Package chromedp is a high level Chrome DevTools Protocol client that
|
|
// simplifies driving browsers for scraping, unit testing, or profiling web
|
|
// pages using the CDP.
|
|
//
|
|
// chromedp requires no third-party dependencies, implementing the async Chrome
|
|
// DevTools Protocol entirely in Go.
|
|
package chromedp
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"log"
|
|
"sync/atomic"
|
|
|
|
"github.com/chromedp/cdproto"
|
|
"github.com/chromedp/cdproto/cdp"
|
|
"github.com/chromedp/cdproto/runtime"
|
|
"github.com/chromedp/cdproto/target"
|
|
"github.com/mailru/easyjson"
|
|
)
|
|
|
|
// Browser is the high-level Chrome DevTools Protocol browser manager, handling
|
|
// the browser process runner, WebSocket clients, associated targets, and
|
|
// network, page, and DOM events.
|
|
type Browser struct {
|
|
userDataDir string
|
|
|
|
conn Transport
|
|
|
|
// next is the next message id.
|
|
next int64
|
|
|
|
// tabQueue is the queue used to create new target handlers, once a new
|
|
// tab is created and attached to. The newly created Target is sent back
|
|
// via tabResult.
|
|
tabQueue chan target.SessionID
|
|
tabResult chan *Target
|
|
|
|
// cmdQueue is the outgoing command queue.
|
|
cmdQueue chan cmdJob
|
|
|
|
// logging funcs
|
|
logf func(string, ...interface{})
|
|
errf func(string, ...interface{})
|
|
}
|
|
|
|
type cmdJob struct {
|
|
msg *cdproto.Message
|
|
resp chan *cdproto.Message
|
|
}
|
|
|
|
// NewBrowser creates a new browser.
|
|
func NewBrowser(ctx context.Context, urlstr string, opts ...BrowserOption) (*Browser, error) {
|
|
conn, err := DialContext(ctx, ForceIP(urlstr))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
b := &Browser{
|
|
conn: conn,
|
|
|
|
tabQueue: make(chan target.SessionID, 1),
|
|
tabResult: make(chan *Target, 1),
|
|
|
|
cmdQueue: make(chan cmdJob),
|
|
|
|
logf: log.Printf,
|
|
}
|
|
|
|
// apply options
|
|
for _, o := range opts {
|
|
if err := o(b); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// ensure errf is set
|
|
if b.errf == nil {
|
|
b.errf = func(s string, v ...interface{}) { b.logf("ERROR: "+s, v...) }
|
|
}
|
|
|
|
go b.run(ctx)
|
|
return b, nil
|
|
}
|
|
|
|
// Shutdown shuts down the browser.
|
|
func (b *Browser) Shutdown() error {
|
|
if b.conn != nil {
|
|
if err := b.send(cdproto.CommandBrowserClose, nil); err != nil {
|
|
b.errf("could not close browser: %v", err)
|
|
}
|
|
return b.conn.Close()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// send writes the supplied message and params.
|
|
func (b *Browser) send(method cdproto.MethodType, params easyjson.RawMessage) error {
|
|
msg := &cdproto.Message{
|
|
ID: atomic.AddInt64(&b.next, 1),
|
|
Method: method,
|
|
Params: params,
|
|
}
|
|
return b.conn.Write(msg)
|
|
}
|
|
|
|
func (b *Browser) newExecutorForTarget(ctx context.Context, sessionID target.SessionID) *Target {
|
|
if sessionID == "" {
|
|
panic("empty session ID")
|
|
}
|
|
b.tabQueue <- sessionID
|
|
return <-b.tabResult
|
|
}
|
|
|
|
func (b *Browser) Execute(ctx context.Context, method string, params json.Marshaler, res json.Unmarshaler) error {
|
|
paramsMsg := emptyObj
|
|
if params != nil {
|
|
var err error
|
|
if paramsMsg, err = json.Marshal(params); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
id := atomic.AddInt64(&b.next, 1)
|
|
ch := make(chan *cdproto.Message, 1)
|
|
b.cmdQueue <- cmdJob{
|
|
msg: &cdproto.Message{
|
|
ID: id,
|
|
Method: cdproto.MethodType(method),
|
|
Params: paramsMsg,
|
|
},
|
|
resp: ch,
|
|
}
|
|
select {
|
|
case msg := <-ch:
|
|
switch {
|
|
case msg == nil:
|
|
return ErrChannelClosed
|
|
case msg.Error != nil:
|
|
return msg.Error
|
|
case res != nil:
|
|
return json.Unmarshal(msg.Result, res)
|
|
}
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
type tabEvent struct {
|
|
sessionID target.SessionID
|
|
msg *cdproto.Message
|
|
}
|
|
|
|
func (b *Browser) run(ctx context.Context) {
|
|
defer b.conn.Close()
|
|
|
|
// add cancel to context
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
// tabEventQueue is the queue of incoming target events, to be routed by
|
|
// their session ID.
|
|
tabEventQueue := make(chan tabEvent, 1)
|
|
|
|
// resQueue is the incoming command result queue.
|
|
resQueue := make(chan *cdproto.Message, 1)
|
|
|
|
// This goroutine continuously reads events from the websocket
|
|
// connection. The separate goroutine is needed since a websocket read
|
|
// is blocking, so it cannot be used in a select statement.
|
|
go func() {
|
|
defer cancel()
|
|
for {
|
|
msg, err := b.conn.Read()
|
|
if err != nil {
|
|
return
|
|
}
|
|
if msg.Method == cdproto.EventRuntimeExceptionThrown {
|
|
ev := new(runtime.EventExceptionThrown)
|
|
if err := json.Unmarshal(msg.Params, ev); err != nil {
|
|
b.errf("%s", err)
|
|
continue
|
|
}
|
|
b.errf("%+v\n", ev.ExceptionDetails)
|
|
continue
|
|
}
|
|
|
|
var sessionID target.SessionID
|
|
if msg.Method == cdproto.EventTargetReceivedMessageFromTarget {
|
|
event := new(target.EventReceivedMessageFromTarget)
|
|
if err := json.Unmarshal(msg.Params, event); err != nil {
|
|
b.errf("%s", err)
|
|
continue
|
|
}
|
|
sessionID = event.SessionID
|
|
msg = new(cdproto.Message)
|
|
if err := json.Unmarshal([]byte(event.Message), msg); err != nil {
|
|
b.errf("%s", err)
|
|
continue
|
|
}
|
|
}
|
|
switch {
|
|
case msg.Method != "":
|
|
if sessionID == "" {
|
|
// TODO: are we interested in browser events?
|
|
continue
|
|
}
|
|
tabEventQueue <- tabEvent{
|
|
sessionID: sessionID,
|
|
msg: msg,
|
|
}
|
|
case msg.ID != 0:
|
|
// We can't process the response here, as it's
|
|
// another goroutine that maintans respByID.
|
|
resQueue <- msg
|
|
default:
|
|
b.errf("ignoring malformed incoming message (missing id or method): %#v", msg)
|
|
}
|
|
}
|
|
}()
|
|
|
|
// This goroutine handles tabs, as well as routing events to each tab
|
|
// via the pages map.
|
|
go func() {
|
|
defer cancel()
|
|
|
|
// This map is only safe for use within this goroutine, so don't
|
|
// declare it as a Browser field.
|
|
pages := make(map[target.SessionID]*Target, 1024)
|
|
for {
|
|
select {
|
|
case sessionID := <-b.tabQueue:
|
|
if _, ok := pages[sessionID]; ok {
|
|
b.errf("executor for %q already exists", sessionID)
|
|
}
|
|
t := &Target{
|
|
browser: b,
|
|
SessionID: sessionID,
|
|
|
|
eventQueue: make(chan *cdproto.Message, 1024),
|
|
waitQueue: make(chan func(cur *cdp.Frame) bool, 1024),
|
|
frames: make(map[cdp.FrameID]*cdp.Frame),
|
|
|
|
logf: b.logf,
|
|
errf: b.errf,
|
|
}
|
|
go t.run(ctx)
|
|
pages[sessionID] = t
|
|
b.tabResult <- t
|
|
case event := <-tabEventQueue:
|
|
page, ok := pages[event.sessionID]
|
|
if !ok {
|
|
b.errf("unknown session ID %q", event.sessionID)
|
|
continue
|
|
}
|
|
select {
|
|
case page.eventQueue <- event.msg:
|
|
default:
|
|
panic("eventQueue is full")
|
|
}
|
|
|
|
case <-ctx.Done():
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
|
|
respByID := make(map[int64]chan *cdproto.Message)
|
|
|
|
// This goroutine handles sending commands to the browser, and sending
|
|
// responses back for each of these commands via respByID.
|
|
for {
|
|
select {
|
|
case res := <-resQueue:
|
|
resp, ok := respByID[res.ID]
|
|
if !ok {
|
|
b.errf("id %d not present in response map", res.ID)
|
|
continue
|
|
}
|
|
if resp != nil {
|
|
// resp could be nil, if we're not interested in
|
|
// this response; for CommandSendMessageToTarget.
|
|
resp <- res
|
|
close(resp)
|
|
}
|
|
delete(respByID, res.ID)
|
|
|
|
case q := <-b.cmdQueue:
|
|
if _, ok := respByID[q.msg.ID]; ok {
|
|
b.errf("id %d already present in response map", q.msg.ID)
|
|
continue
|
|
}
|
|
respByID[q.msg.ID] = q.resp
|
|
|
|
if q.msg.Method == "" {
|
|
// Only register the chananel in respByID;
|
|
// useful for CommandSendMessageToTarget.
|
|
continue
|
|
}
|
|
if err := b.conn.Write(q.msg); err != nil {
|
|
b.errf("%s", err)
|
|
continue
|
|
}
|
|
|
|
case <-ctx.Done():
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// BrowserOption is a browser option.
|
|
type BrowserOption func(*Browser) error
|
|
|
|
// WithLogf is a browser option to specify a func to receive general logging.
|
|
func WithLogf(f func(string, ...interface{})) BrowserOption {
|
|
return func(b *Browser) error {
|
|
b.logf = f
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithErrorf is a browser option to specify a func to receive error logging.
|
|
func WithErrorf(f func(string, ...interface{})) BrowserOption {
|
|
return func(b *Browser) error {
|
|
b.errf = f
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithConsolef is a browser option to specify a func to receive chrome log events.
|
|
//
|
|
// Note: NOT YET IMPLEMENTED.
|
|
func WithConsolef(f func(string, ...interface{})) BrowserOption {
|
|
return func(b *Browser) error {
|
|
return nil
|
|
}
|
|
}
|