Files
anything-llm/server/utils/agents/aibitat/plugins/web-scraping.js
Akhil cb4a06ce5c fix: surface readable error messages in web-scraping agent and ai-provider (#5476)
* fix: surface readable error messages in web-scraping agent and ai-provider

* simplify

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2026-04-21 15:00:12 -07:00

159 lines
5.6 KiB
JavaScript

const { CollectorApi } = require("../../../collectorApi");
const Provider = require("../providers/ai-provider");
const { summarizeContent } = require("../utils/summarize");
const webScraping = {
name: "web-scraping",
startupConfig: {
params: {},
},
plugin: function () {
return {
name: this.name,
setup(aibitat) {
aibitat.function({
super: aibitat,
name: this.name,
controller: new AbortController(),
description:
"Read and extract content from a specific webpage URL. Fetch the text from a website, get the contents of a link, or visit a URL to see what it says. Use when you have a specific web address to read.",
examples: [
{
prompt: "Read that URL for me",
call: JSON.stringify({ url: "https://example.com" }),
},
{
prompt: "What is anythingllm.com about?",
call: JSON.stringify({ url: "https://anythingllm.com" }),
},
{
prompt: "Scrape https://example.com",
call: JSON.stringify({ url: "https://example.com" }),
},
],
parameters: {
$schema: "http://json-schema.org/draft-07/schema#",
type: "object",
properties: {
url: {
type: "string",
format: "uri",
description:
"A complete web address URL including protocol. Assumes https if not provided.",
},
},
additionalProperties: false,
},
handler: async function ({ url }) {
try {
if (url) return await this.scrape(url);
return "There is nothing we can do. This function call returns no information.";
} catch (error) {
const errorMessage = error?.message ?? JSON.stringify(error);
this.super.handlerProps.log(
`Web Scraping Error: ${errorMessage}`
);
this.super.introspect(
`${this.caller}: Web Scraping Error: ${errorMessage}`
);
return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${errorMessage}`;
}
},
/**
* Report a URL citation to be displayed in the chat UI.
* @param {string} url - The URL that was accessed
* @param {string} content - The content retrieved from the URL
*/
reportUrlCitation: function (url, content) {
try {
const urlObj = new URL(url);
this.super.addCitation?.({
id: url,
title: urlObj.hostname + urlObj.pathname,
text: content,
chunkSource: `link://${url}`,
score: null,
});
} catch {
// URL parsing failed, still add citation without parsed title
this.super.addCitation?.({
id: url,
title: url,
text: content,
chunkSource: `link://${url}`,
score: null,
});
}
},
/**
* Scrape a website and summarize the content based on objective if the content is too large.
* Objective is the original objective & task that user give to the agent, url is the url of the website to be scraped.
* Here we can leverage the document collector to get raw website text quickly.
*
* @param url
* @returns
*/
scrape: async function (url) {
this.super.introspect(
`${this.caller}: Scraping the content of ${url}`
);
const { success, content } =
await new CollectorApi().getLinkContent(url);
if (!success) {
this.super.introspect(
`${this.caller}: could not scrape ${url}. I can't use this page's content.`
);
throw new Error(
`URL could not be scraped and no content was found.`
);
}
if (!content || content?.length === 0) {
throw new Error("There was no content to be collected or read.");
}
this.reportUrlCitation(url, content);
const { TokenManager } = require("../../../helpers/tiktoken");
const tokenEstimate = new TokenManager(
this.super.model
).countFromString(content);
if (
tokenEstimate <
Provider.contextLimit(this.super.provider, this.super.model)
) {
this.super.introspect(
`${this.caller}: Looking over the content of the page. ~${tokenEstimate} tokens.`
);
return content;
}
this.super.introspect(
`${this.caller}: This page's content exceeds the model's context limit. Summarizing it right now.`
);
this.super.onAbort(() => {
this.super.handlerProps.log(
"Abort was triggered, exiting summarization early."
);
this.controller.abort();
});
return summarizeContent({
provider: this.super.provider,
model: this.super.model,
controllerSignal: this.controller.signal,
content,
});
},
});
},
};
},
};
// Public surface of this module: the web-scraping agent plugin definition.
module.exports = { webScraping };