fix: add order:true to all create transactions (#3948)

This commit is contained in:
Finley Ge
2025-03-03 11:37:51 +08:00
committed by GitHub
parent 113e8f711f
commit 4bc7f21182
30 changed files with 425 additions and 352 deletions

View File

@@ -5,200 +5,203 @@ import { setupPage } from '../utils/setupPage';
import { Cluster } from 'puppeteer-cluster';
/**
 * Resolves after a random delay, used to space out retries.
 *
 * @param min - Lower bound of the delay in milliseconds (inclusive).
 * @param max - Upper bound of the delay in milliseconds (inclusive).
 * @returns A promise that settles once the chosen delay has elapsed.
 */
async function randomWait(min: number, max: number): Promise<void> {
  // Uniform integer in [min, max]; the +1 makes the upper bound inclusive.
  const delay = Math.floor(Math.random() * (max - min + 1)) + min;
  return new Promise<void>((resolve) => setTimeout(resolve, delay));
}
/**
 * Scrapes Baidu result pages for `query` and resolves every scraped link to the
 * URL it finally lands on.
 *
 * The work happens in three phases:
 *  1. Load the result pages with a single Puppeteer page and collect the title,
 *     link and snippet of every `.result.c-container` entry.
 *  2. Issue a plain `fetch` for each collected link and record `response.url`
 *     as the entry's real URL.
 *  3. For links where the plain fetch failed, retry through a puppeteer-cluster
 *     (up to 3 attempts per link) and record `page.url()` instead.
 *
 * @param query - Search terms; URL-encoded into the `wd` query parameter.
 * @param pageCount - Number of results wanted; also caps the returned list.
 * @param searchUrlBase - Base search URL. When empty, an empty result is returned.
 * @param categories - Not used by this implementation; kept so the signature
 *   matches the other search engines' `fetchSearchResults`.
 * @returns `resultUrls` (resolved URLs, at most `pageCount`) and `results`, a map
 *   from resolved URL to its result record.
 */
export const fetchSearchResults = async (
  query: string,
  pageCount: number,
  searchUrlBase: string,
  categories: string
) => {
  console.log(`Fetching Baidu search results for query: ${query}`);

  // Nothing to search against: return an empty result set.
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }

  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  // Pagination advances `pn` by 10 per page, so 10 results per page are assumed.
  const pagesToFetch = Math.ceil(pageCount / 10);

  const browser = await puppeteer.launch({
    ignoreDefaultArgs: ['--enable-automation'],
    headless: true,
    executablePath: '/usr/bin/chromium', // Explicit Chromium path
    pipe: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu'
      // '--single-process'
    ]
  });

  // Close the browser even if page creation/setup throws, so Chromium is not leaked.
  try {
    const page = await browser.newPage();
    await setupPage(page);

    for (let i = 0; i < pagesToFetch; i++) {
      const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
      console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);

      let retryCount = 0;
      let success = false;
      while (retryCount < 5 && !success) {
        try {
          console.time(`Page Load Time for page ${i + 1}`);
          await page.goto(searchUrl.toString(), { waitUntil: 'load' });
          console.timeEnd(`Page Load Time for page ${i + 1}`);

          const html = await page.content();
          const dom = new JSDOM(html);
          const document = dom.window.document;
          console.log(document.title);

          // Baidu served its security-verification page: re-run page setup,
          // wait a random interval and try this page again.
          if (document.title.includes('百度安全验证')) {
            console.log('Detected Baidu security verification, retrying...');
            await setupPage(page);
            retryCount++;
            await randomWait(1000, 3000);
            continue;
          }

          // Parse the search results.
          console.time(`Link Retrieval Time for page ${i + 1}`);
          const resultContainers = document.querySelectorAll('.result.c-container');
          for (const result of resultContainers) {
            // Collect a few more than requested before stopping.
            if (resultUrls.length > pageCount + 5) {
              break;
            }
            const titleElement = result.querySelector('h3 a');
            const title = titleElement ? titleElement.textContent : '';
            const url = titleElement ? titleElement.getAttribute('href') : '';
            const contentElement = result.querySelector('[class^="content"]');
            const snippet = contentElement ? contentElement.textContent : '';

            if (url) {
              resultUrls.push(url);
              results.set(url, {
                title,
                url,
                snippet,
                source: 'baidu',
                crawlStatus: 'Pending',
                score: 0
              });
            }
          }
          console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
          success = true;
        } catch (error) {
          console.error(`Error fetching page ${i + 1}:`, error);
          retryCount++;
        }
      }
    }
  } finally {
    await browser.close();
  }

  console.log('fetch all fake urls');

  // Fast path: resolve each scraped link with a plain HTTP request.
  const urlsToProcessWithPuppeteer: string[] = [];
  for (const url of resultUrls) {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
          Referer: 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          Connection: 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        // `response.url` is the URL after any redirects were followed.
        const realUrl = response.url;
        console.log('realurl:', realUrl);
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      // Fall back to a real browser for this link.
      console.error(`Error fetching original URL for ${url}:`, error);
      urlsToProcessWithPuppeteer.push(url);
    }
  }

  console.log('pass quickfetch');

  // Slow path: resolve the remaining links concurrently in browser contexts.
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 10,
    puppeteerOptions: {
      ignoreDefaultArgs: ['--enable-automation'],
      // NOTE(review): string 'true' rather than boolean true as used in the
      // launch call above; kept as-is, confirm against the installed puppeteer version.
      headless: 'true',
      executablePath: '/usr/bin/chromium', // Explicit Chromium path
      pipe: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
    }
  });

  let failedUrlCount = 0;

  // Close the cluster even if queueing or idling throws.
  try {
    await cluster.task(async ({ page, data: url }) => {
      let retryUrlCount = 0;
      let urlSuccess = false;
      while (retryUrlCount < 3 && !urlSuccess) {
        console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
        try {
          await page.goto(url, { waitUntil: 'load' });

          // Guard against the page having been closed during navigation.
          if (page.isClosed()) {
            throw new Error('Page has been closed');
          }

          const realUrl = page.url(); // URL the navigation ended on
          const result = results.get(url);
          if (result) {
            result.url = realUrl;
            result.crawlStatus = 'Success';
          }
          urlSuccess = true;
        } catch (error) {
          console.error(`Error fetching original URL, retrying...`, error);
          retryUrlCount++;
          await randomWait(1000, 3000);
        }
      }
      if (!urlSuccess) {
        failedUrlCount++;
      }
    });

    for (const url of urlsToProcessWithPuppeteer) {
      cluster.queue(url);
    }

    await cluster.idle();
  } finally {
    await cluster.close();
  }

  console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);

  // Keep only the first `pageCount` results, re-keyed by their resolved URL.
  const filteredResults = Array.from(results.values()).slice(0, pageCount);

  return {
    resultUrls: filteredResults.map((result) => result.url),
    results: new Map(filteredResults.map((result) => [result.url, result]))
  };
};

View File

@@ -6,9 +6,13 @@ dotenv.config();
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];
export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
const MAX_PAGES = (pageCount / 10 +1) * 2+1; // 最多搜索的页面数
export const fetchSearchResults = async (
query: string,
pageCount: number,
searchUrlBase: string,
categories: string
) => {
const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // 最多搜索的页面数
//如果searchUrlBase为空返回空数组pagecount是需要搜索结果的数量
if (!searchUrlBase) {
return { resultUrls: [], results: new Map() };
@@ -20,7 +24,9 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc
let pageIndex = 0;
while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) {
const searchUrl = new URL(`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`);
const searchUrl = new URL(
`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`
);
console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
const response = await axios.get(searchUrl.toString());
const jsonResults = response.data.results;
@@ -28,7 +34,10 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc
for (let index = 0; index < jsonResults.length; index++) {
const result = jsonResults[index];
const resultDomain = new URL(result.url).hostname;
if (blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || resultDomain.includes('zhihu')) {
if (
blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) ||
resultDomain.includes('zhihu')
) {
continue;
}
resultUrls.push(result.url);
@@ -52,4 +61,4 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc
}
return { resultUrls, results };
};
};