3

I would like to know if it is possible to have one .js file that opens a browser instance, creates a new page/tab, logs in to a website (with username/password) and then just stays idle.
And then, in a second .js file, use the browser instance and page from the first file.

1.js

const puppeteer = require('puppeteer');

// Starts a headless Chromium, opens google.com in a new tab and leaves the
// browser running (it is intentionally never closed here).
(async function launchIdleBrowser() {
    const launchOptions = {
        headless: true,
        args: ['--no-sandbox'],
        ignoreDefaultArgs: ["--hide-scrollbars"]
    };

    const browser = await puppeteer.launch(launchOptions);
    const page = await browser.newPage();
    await page.goto('https://google.com');

    console.log('Browser open in the background (headless)!');
})();

2.js

const puppeteer = require('puppeteer');

// Placeholder entry point for the second script.
(async function useSharedBrowser() {
    // instructions on browser instance/page from 1.js ...
})();
CC BY-SA 4.0
2
  • create a class in 1.js and keep the browser/page state, and add relevant functions for logging in. in 2.js create an instance of the first class and interact with it. if you want to keep it idle set timeout to 0 to disable it.
    – mbit
    Commented Feb 8, 2020 at 1:33
  • @mbit sorry i'm a newbie in nodejs, could you explain me: how to keep a browser state and how to create an instance of the first class. thanks
    – skillsboy
    Commented Feb 8, 2020 at 2:22

3 Answers 3

8

The crawler object keeps the state of the browser instance and wherever you call/pass that instance, it refers to the same chromium in the "background". If this is an overkill, and you just want to connect to an already running chromium using puppeteer, you can do it with puppeteer.connect. take a look at this: How to "hook in" puppeteer into a running Chrome instance/tab – mbit

Yeah, I guess it's too much overkill for me :). The link you posted was what I wanted, but I have 2 questions.

This is a sample of what I have.

// 1.js
// open chromium, new tab, go to google.com, print browserWSEndpoint, disconnect
const puppeteer = require('puppeteer');

// Launches a headed Chromium, opens google.com in a new tab, prints the
// browser's WebSocket endpoint (the value 2.js passes to puppeteer.connect),
// then detaches this script from the browser.
(async () => {
    const browser = await puppeteer.launch({headless: false});
    const page = await browser.newPage();
    await page.goto('https://google.com');

    const browserWSEndpoint = browser.wsEndpoint();
    console.log(browserWSEndpoint); // prints: ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e

    // Awaited (as 2.js does) so a failure while detaching is not left as a
    // floating promise; awaiting is harmless if disconnect() is synchronous.
    await browser.disconnect();
})();

And

// 2.js
// connect to the open browser with the browserWSEndpoint manually put in, ... , disconnect.
const puppeteer = require('puppeteer');

// Attaches to an already running browser through its WebSocket endpoint and
// detaches again without closing it.
(async function attachToRunningBrowser() {
    // Endpoint pasted by hand from the console output of 1.js.
    const endpoint = 'ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e';
    const browser = await puppeteer.connect({browserWSEndpoint: endpoint});

    // somehow use the tab that is open from 1.js (google.com)

    await browser.disconnect();
})();

I get the browserWSEndpoint string from the console.log in 1.js. It works great, but I have two difficulties.

1 - How can I use the variable browserWSEndpoint from 1.js so I don't have to always copy-paste it into 2.js?

2 - If I open a new page/tab in 1.js, go for example to google and disconnect (browser.disconnect()), how can I use that page/tab in 2.js?

CC BY-SA 4.0
2
  • Update: For 1, I used fs.writeFileSync in 1.js and fs.readFileSync in 2.js, so I just get the value of the endpoint that way, but I would like to know how to do it a different way if possible. And for 2, I used browser.pages(), which returns an array of the pages/tabs that are open, and it works great.
    – skillsboy
    Commented Feb 12, 2020 at 1:33
  • Why not edit that into your answer?
    – ggorlen
    Commented May 12 at 16:18
2

Working, tested code; getEmail.js is where the actual page will be exported. Ask for clarifications in the comments.

getBrowser.js

const puppeteer = require("puppeteer");
module.exports = {
browser: {},
pptr_instance_url:"",
getBrow: async function(){  try {
    console.log("line6",this.pptr_instance_url);
    this.browser = await puppeteer.connect({browserWSEndpoint: this.pptr_instance_url}).catch(async e =>{
        console.log("end point",this.pptr_instance_url);
        this.browser = await puppeteer.launch({timeout: 0});
        this.pptr_instance_url = this.browser.wsEndpoint();
        console.log("line 11",this.pptr_instance_url);
        return this.browser;
    });
    return this.browser;
}catch (e){
    console.log(e)
} }
}

pageRenderer.js

const abc = require("../getBrowsernew")
/**
 * Stores the request body in local storage under a fresh uuid key, opens a
 * new tab in the shared browser and navigates it to the target url.
 *
 * @param {object} request - incoming request; only `request.body` is read.
 * @returns {Promise<object>} the navigated page; the caller is responsible
 *   for closing it.
 * @throws rethrows any navigation error after closing the tab it opened.
 */
const pageRenderer = async (request) => {
    // NOTE(review): reactProjectUrl is read but never used; presumably it was
    // meant to be the navigation target — confirm.
    const {reactProjectUrl} = constants;
    const uuidStorageKey = uuidv4();
    const localStorageObject = {[uuidStorageKey]: request.body};

    const browser = await abc.getBrow();
    const url = "someurl.com";
    await setLocalStorage(browser, url, localStorageObject);

    const page = await browser.newPage();
    try {
        // page.goto takes (url, options): a third argument is ignored, so the
        // timeout has to live in the same options object as waitUntil.
        await page.goto(url, {waitUntil: "networkidle0", timeout: 0});
    } catch (err) {
        // Do not leave an orphaned tab behind when navigation fails.
        await page.close();
        throw err;
    }
    return page;
};

module.exports = pageRenderer;

getEmail.js

const pageRenderer = require("./pageRenderer");

/**
 * Renders the page for the given request and returns its full HTML.
 *
 * @param {object} request - forwarded unchanged to pageRenderer.
 * @returns {Promise<string>} the value of page.content().
 */
const getEmail = async (request) => {
    const page = await pageRenderer(request);
    try {
        return await page.content();
    } finally {
        // Always release the tab, and wait for it, so a failed content() call
        // does not leak the page and close errors are not left floating.
        await page.close();
    }
};

module.exports = getEmail;
CC BY-SA 4.0
1
1

You can implement this in many ways like having separate modules with functions, or different classes, and it depends on your particular need.

You can have a class that launches the browser and creates pages plus some extra functionalities.

//1.js
const puppeteer = require('puppeteer');

/**
 * Holds one launched browser and one page so that other modules can keep
 * working with the same instances. Create it with `await Crawler.build()`.
 */
class Crawler {
    constructor() {
        //init with whatever values you'll need in your class
        //or throw an error if the object wasn't created through build
    }

    /**
     * Async factory: constructs a Crawler and waits for its browser and page.
     * @returns {Promise<Crawler>} a ready-to-use crawler.
     */
    static async build() {
        const crawler = new Crawler();
        await crawler._init();
        return crawler;
    }

    /**
     * Launches the browser and opens the page, keeping both on the instance.
     * @throws rethrows a newPage() failure after closing the launched browser.
     */
    async _init() {
        //launch the browser and keep its state
        this._browser = await puppeteer.launch({timeout: 0});
        try {
            //create a page and keep its state
            this._page = await this._browser.newPage();
        } catch (err) {
            // build() never returns the instance on failure, so nobody else
            // could close this browser; best-effort close, then surface the
            // original error.
            await this._browser.close().catch(() => {});
            throw err;
        }
    }

    /** The browser launched by _init. */
    get browser() {
        return this._browser;
    }

    /** The page opened by _init. */
    get page() {
        return this._page;
    }

    /**
     * Navigates the crawler's page to the login url.
     * @param {string} url - address of the login page.
     * @returns {Promise<*>} whatever page.goto resolves to.
     */
    async login(url) {
        const response = await this._page.goto(url);
        //do whatever is related to the login process
        return response;
    }
}

module.exports = {Crawler};

Note that a constructor can't be async, so we can't await anything inside it. Since launching the browser is async, we use something like a static build function to initialize the browser when creating the object. Then we create the crawler object like this:

//2.js
const {Crawler} = require('./1.js');

// Builds a crawler, runs its login step, then prints the url of the page the
// crawler holds.
(async function runCrawlerExample() {
    const crawler = await Crawler.build();
    await crawler.login("https://example.com");

    //access crawler's page
    const currentUrl = crawler.page.url();
    console.log(currentUrl);
})();

Keep in mind that this is only an example and by no means representative of the best practices. So first, you need to understand what you want to achieve out of such encapsulation, then adopt the method that suits you best.

Read more on JS classes here

CC BY-SA 4.0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Not the answer you're looking for? Browse other questions tagged or ask your own question.