Puppeteer - 向下滚动,直到无法继续滚动

当我向下滚动时,我处在一个新内容被创建的情况中。新内容有一个特定的类名。

如何继续向下滚动,直到所有元素都已加载?

换句话说,我希望达到这样一个阶段: 如果继续向下滚动,就不会加载任何新内容。

我用代码向下滚动,加上一个

await page.waitForSelector('.class_name');

这种方法的问题在于,在加载了所有元素之后,代码继续向下滚动,没有创建任何新元素,最终会出现超时错误。

这是代码:

// Scroll the page down by one viewport height.
await page.evaluate( () => {
window.scrollBy(0, window.innerHeight);
});
// Wait for an element matching '.class_name'. The question reports that once all
// content has been loaded, repeating this step eventually ends in a timeout error.
await page.waitForSelector('.class_name');
103086 次浏览

Give this a shot:

const puppeteer = require('puppeteer');


// Launches a visible browser, scrolls the page to the bottom with autoScroll()
// and saves a full-page screenshot to 'yoursite.png'.
(async () => {
  const browser = await puppeteer.launch({
    headless: false
  });
  try {
    const page = await browser.newPage();
    // Set the viewport before navigating so the page is laid out once at the
    // target size instead of being resized after load.
    await page.setViewport({
      width: 1200,
      height: 800
    });
    await page.goto('https://www.yoursite.com');

    await autoScroll(page);

    await page.screenshot({
      path: 'yoursite.png',
      fullPage: true
    });
  } finally {
    // Always release the browser, even if navigation, scrolling or the
    // screenshot throws.
    await browser.close();
  }
})();


/**
 * Scrolls the page down in fixed steps until the accumulated scroll distance
 * covers the scrollable height of the document body.
 *
 * @param {object} page - Puppeteer page; only `page.evaluate` is used.
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per step; must be a
 *   positive finite number.
 * @param {number} [options.delay=100] - Milliseconds between steps.
 * @returns {Promise<void>} Resolves once the bottom has been reached.
 * @throws {RangeError} If `distance` is not a positive finite number (a
 *   non-positive step could never reach the bottom).
 */
async function autoScroll(page, { distance = 100, delay = 100 } = {}) {
  if (!Number.isFinite(distance) || distance <= 0) {
    throw new RangeError('distance must be a positive finite number');
  }

  await page.evaluate(
    ({ step, interval }) =>
      new Promise((resolve) => {
        let totalHeight = 0;
        const timer = setInterval(() => {
          // Re-read the height on every tick: the body may have grown since
          // the previous step.
          const { scrollHeight } = document.body;
          window.scrollBy(0, step);
          totalHeight += step;

          // The available scrolling distance is the body height minus the
          // viewport height, not the entire body height.
          if (totalHeight >= scrollHeight - window.innerHeight) {
            clearInterval(timer);
            resolve();
          }
        }, interval);
      }),
    { step: distance, interval: delay }
  );
}

Source: https://github.com/chenxiaochun/blog/issues/38

EDIT

added window.innerHeight to the calculation because the available scrolling distance is body height minus viewport height, not the entire body height.

Scrolling down to the bottom of the page can be accomplished in 2 ways:

  1. use scrollIntoView (to scroll to the part of the page that can create more content at the bottom) and selectors (i.e., document.querySelectorAll('.class_name').length to check whether more content has been generated)
  2. use scrollBy (to incrementally scroll down the page) and either setTimeout or setInterval (to incrementally check whether we are at the bottom of the page)

Here is an implementation using scrollIntoView and selector (assuming .class_name is the selector that we scroll into for more content) in plain JavaScript that we can run in the browser:

Method 1: use scrollIntoView and selectors

// Repeatedly scrolls the last '.class_name' element into view and stops as soon
// as a scroll no longer increases the number of '.class_name' elements.
const delay = 3000; // ms given to the page to load more content after each scroll
const wait = (ms) => new Promise((res) => setTimeout(res, ms));
const count = () => document.querySelectorAll('.class_name').length;
const scrollDown = () => {
  // Take the last match in document order. '.class_name:last-child' only
  // matches an element that is the last child of its parent, so it can yield
  // null (e.g. when another node follows the items) or an unintended element.
  const items = document.querySelectorAll('.class_name');
  const last = items[items.length - 1];
  if (last) {
    last.scrollIntoView({ behavior: 'smooth', block: 'end', inline: 'end' });
  }
};

let preCount = 0;
let postCount = 0;
do {
  preCount = count();
  scrollDown();
  await wait(delay);
  postCount = count();
} while (postCount > preCount);
await wait(delay);

In this method, we are comparing the # of .class_name selectors before scrolling (preCount) vs after scrolling (postCount) to check whether we are at bottom of page:

// preCount / postCount: number of '.class_name' elements before / after scrolling.
if (postCount > preCount) {
  // NOT bottom of page
} else {
  // bottom of page
}

And here are 2 possible implementations using either setTimeout or setInterval with scrollBy in plain JavaScript that we can run in the browser console:

Method 2a: use setTimeout with scrollBy

// Scrolls down step by step, pausing between steps, until the bottom is reached.
const distance = 100; // px scrolled per step
const delay = 100; // ms between steps
const scroller = document.scrollingElement;
// scrollTop can be a fractional value, so an exact "scrollTop + innerHeight <
// scrollHeight" test may stay true forever; treat "within 1px" as the bottom.
const isAtBottom = () =>
  scroller.scrollHeight - scroller.scrollTop - window.innerHeight <= 1;
while (!isAtBottom()) {
  scroller.scrollBy(0, distance);
  await new Promise((resolve) => { setTimeout(resolve, delay); });
}

Method 2b: use setInterval with scrollBy

// Scrolls down one step per timer tick and stops the timer at the bottom.
const distance = 100; // px scrolled per step
const delay = 100; // ms between steps
const timer = setInterval(() => {
  const scroller = document.scrollingElement;
  scroller.scrollBy(0, distance);
  // scrollTop can be fractional, so allow a 1px tolerance; an exact comparison
  // might never be satisfied and the timer would run forever.
  if (scroller.scrollHeight - scroller.scrollTop - window.innerHeight <= 1) {
    clearInterval(timer);
  }
}, delay);

In this method, we are comparing document.scrollingElement.scrollTop + window.innerHeight with document.scrollingElement.scrollHeight to check whether we are at the bottom of the page:

// Bottom-of-page test: the page can still scroll while the lower edge of the
// viewport (scrollTop + innerHeight) lies above the end of the content
// (scrollHeight).
if (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) {
// NOT bottom of page
} else {
// bottom of page
}

If any of the JavaScript snippets above scrolls the page all the way down to the bottom, then we know it is working and we can automate it using Puppeteer.

Here are the sample Puppeteer Node.js scripts that will scroll down to the bottom of the page and wait a few seconds before closing the browser.

Puppeteer Method 1: use scrollIntoView with selector (.class_name)

const puppeteer = require('puppeteer');


// Opens the page, keeps scrolling the last '.class_name' element into view
// until no further elements appear, then closes the browser.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
    args: ['--window-size=800,600']
  });
  // page.waitFor() is deprecated and no longer available in newer Puppeteer
  // releases; a plain timer promise works with every version.
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    const delay = 3000; // ms given to the page to load more content
    let preCount = 0;
    let postCount = 0;
    do {
      preCount = await getCount(page);
      await scrollDown(page);
      await sleep(delay);
      postCount = await getCount(page);
    } while (postCount > preCount);
    await sleep(delay);
  } finally {
    // Close the browser even when navigation or scrolling throws.
    await browser.close();
  }
})();


/**
 * Counts the elements on the page that match '.class_name'.
 *
 * @param {object} page - Object exposing Puppeteer's `$$eval(selector, fn)`.
 * @returns {Promise<number>} Number of matching elements.
 */
async function getCount(page) {
  const countMatches = (matches) => matches.length;
  return page.$$eval('.class_name', countMatches);
}


/**
 * Scrolls the last element matching '.class_name' (in document order) into
 * view. Does nothing when no element matches.
 *
 * @param {object} page - Object exposing Puppeteer's `$$eval(selector, fn)`.
 * @returns {Promise<void>}
 */
async function scrollDown(page) {
  // '.class_name:last-child' only matches elements that are the last child of
  // their parent, and page.$eval rejects when nothing matches. Selecting every
  // match and taking the final one targets the intended element and tolerates
  // an empty result.
  await page.$$eval('.class_name', (elements) => {
    const last = elements[elements.length - 1];
    if (last) {
      last.scrollIntoView({ behavior: 'smooth', block: 'end', inline: 'end' });
    }
  });
}

Puppeteer Method 2a: use setTimeout with scrollBy

const puppeteer = require('puppeteer');


// Opens the page, scrolls to the bottom with scrollToBottom(page), waits three
// seconds and closes the browser.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
    args: ['--window-size=800,600']
  });
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    await scrollToBottom(page);
    // page.waitFor() is deprecated and no longer available in newer Puppeteer
    // releases; a plain timer promise works with every version.
    await new Promise((resolve) => setTimeout(resolve, 3000));
  } finally {
    // Close the browser even when navigation or scrolling throws.
    await browser.close();
  }
})();


/**
 * Scrolls the page down step by step from the Node side until the bottom of
 * the scrolling element is reached.
 *
 * @param {object} page - Puppeteer page; only `page.evaluate` is used.
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels per step; should be less
 *   than or equal to window.innerHeight.
 * @param {number} [options.delay=100] - Milliseconds to pause between steps.
 * @returns {Promise<void>}
 */
async function scrollToBottom(page, { distance = 100, delay = 100 } = {}) {
  // Runs in the page. scrollTop can be fractional, so an exact comparison with
  // scrollHeight may never report the bottom; allow a 1px tolerance instead.
  const canScrollFurther = () => {
    const scroller = document.scrollingElement;
    return scroller.scrollHeight - scroller.scrollTop - window.innerHeight > 1;
  };

  while (await page.evaluate(canScrollFurther)) {
    await page.evaluate((y) => { document.scrollingElement.scrollBy(0, y); }, distance);
    // page.waitFor() is deprecated and no longer available in newer Puppeteer
    // releases; a plain timer promise works with every version.
    await new Promise((resolve) => setTimeout(resolve, delay));
  }
}

Puppeteer Method 2b: use setInterval with scrollBy

const puppeteer = require('puppeteer');


// Opens the page, runs scrollToBottom inside the page to reach the bottom,
// waits three seconds and closes the browser.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
    args: ['--window-size=800,600']
  });
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    await page.evaluate(scrollToBottom);
    // page.waitFor() is deprecated and no longer available in newer Puppeteer
    // releases; a plain timer promise works with every version.
    await new Promise((resolve) => setTimeout(resolve, 3000));
  } finally {
    // Close the browser even when navigation or scrolling throws.
    await browser.close();
  }
})();


/**
 * Scrolls the current document to the bottom, one step per timer tick.
 * Intended to be passed to `page.evaluate`, so it must stay self-contained
 * (it may only use browser globals and its own parameters).
 *
 * @param {number} [distance=100] - Pixels per step; should be less than or
 *   equal to window.innerHeight.
 * @param {number} [delay=100] - Milliseconds between steps.
 * @returns {Promise<void>} Resolves once the bottom has been reached.
 */
async function scrollToBottom(distance = 100, delay = 100) {
  await new Promise((resolve) => {
    const timer = setInterval(() => {
      const scroller = document.scrollingElement;
      scroller.scrollBy(0, distance);
      // scrollTop can be fractional, so an exact ">= scrollHeight" test may
      // never pass and the promise would never resolve; allow a 1px tolerance.
      if (scroller.scrollHeight - scroller.scrollTop - window.innerHeight <= 1) {
        clearInterval(timer);
        resolve();
      }
    }, delay);
  });
}

based on answer from this url

// Jump straight to the current bottom of the document body.
await page.evaluate(() => {
  const { body } = window.document;
  window.scrollTo(0, body.scrollHeight);
});

You might just use the following code using page.keyboard object:

await page.keyboard.press('ArrowDown');
// delay() returns a promise; without `await` the script would not pause at all.
await delay(2000); // wait for 2 seconds
await page.keyboard.press('ArrowUp');
/**
 * Helper for waiting.
 *
 * @param {number} milliseconds - How long to wait.
 * @returns {Promise<void>} Promise that resolves, with no value, after the timeout fires.
 */
function delay(milliseconds) {
  return new Promise((done) => {
    setTimeout(() => done(), milliseconds);
  });
}

Many solutions here assume the page height being constant. This implementation works even if the page height changes (e.g. loading new content as user scrolls down).

// Scrolls 100px every 10ms and finishes on the first tick where the scroll
// position did not change compared with the previously recorded one.
await page.evaluate(
  () =>
    new Promise((done) => {
      let previousTop = -1;
      const timerId = setInterval(() => {
        window.scrollBy(0, 100);
        const currentTop = document.documentElement.scrollTop;
        if (currentTop === previousTop) {
          // No movement since the last tick: stop polling and finish.
          clearInterval(timerId);
          done();
        } else {
          previousTop = currentTop;
        }
      }, 10);
    })
);

Pretty simple solution

// Jump to the bottom repeatedly until the document height stops growing.
let lastHeight = await page.evaluate(() => document.body.scrollHeight);

while (true) {
  await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
  // page.waitForTimeout() is not available in current Puppeteer releases; a
  // plain timer promise works with every version. Sleep a bit so newly
  // requested content has time to load.
  await new Promise((resolve) => setTimeout(resolve, 2000));
  const newHeight = await page.evaluate(() => document.body.scrollHeight);
  if (newHeight === lastHeight) {
    break;
  }
  lastHeight = newHeight;
}

Much easier:

    // Scroll by the full document height, wait a second, and repeat for as
    // long as the document height exceeds the previously recorded height.
    await page.evaluate(async () => {
      const pause = (ms) => new Promise((resolve) => {
        setTimeout(resolve, ms);
      });

      let reachedHeight = 0;
      let currentHeight = document.body.scrollHeight;

      while (reachedHeight < currentHeight) {
        window.scrollBy(0, currentHeight);
        await pause(1000);
        reachedHeight = currentHeight;
        currentHeight = document.body.scrollHeight;
      }
    });

A similar solution to @EdvinTr, it's giving me great results. Scrolling and comparing with the page's Y Offset, very simple.

// Scroll by the document height repeatedly until the vertical page offset
// stops changing.
let originalOffset = 0;
while (true) {
  await page.evaluate(() => window.scrollBy(0, document.body.scrollHeight));
  // page.waitForTimeout() is not available in current Puppeteer releases; a
  // plain timer promise works with every version.
  await new Promise((resolve) => setTimeout(resolve, 200));
  const newOffset = await page.evaluate(() => window.pageYOffset);
  if (originalOffset === newOffset) {
    break;
  }
  originalOffset = newOffset;
}

why not just

// Page down once, then press the End key down (this snippet never releases it).
await page.keyboard.press('PageDown');
await page.keyboard.down('End');

Basically, when this runs, Playwright holds down the End key on the keyboard. If you prefer, you can call press in a loop instead, which has the same effect.