Skip to content

Commit e0ed956

Browse files
committed
html filtering
1 parent dae228f commit e0ed956

File tree

4 files changed

+134
-4
lines changed

4 files changed

+134
-4
lines changed

docs/docs/playwright-web/Examples.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,26 @@ When I extract the HTML content of the page
116116
Then I should receive the complete HTML structure of the page
117117
```
118118

119+
You can also filter HTML content for easier analysis:
120+
121+
```bdd
122+
Given I navigate to website "https://example.com/products"
123+
When I extract the HTML content of the page filtered to remove scripts and styles
124+
Then I should receive clean HTML without JavaScript or CSS code
125+
126+
Given I navigate to website "https://example.com/products"
127+
When I extract the HTML content of the page filtered to remove meta tags
128+
Then I should receive HTML without metadata like charset, viewport, and SEO tags
129+
130+
Given I navigate to website "https://example.com/products"
131+
When I extract the HTML content using the cleanHtml option
132+
Then I should receive a clean version of the HTML without scripts, styles, comments, and meta tags
133+
134+
Given I navigate to website "https://example.com/products"
135+
When I extract only the HTML for the main product container using selector "#product-listings"
136+
Then I should receive just the HTML for the products section for easier analysis
137+
```
138+
119139
Example use case for content analysis:
120140

121141
```bdd

docs/docs/playwright-web/Supported-Tools.mdx

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,9 +283,25 @@ Get the visible text content of the current page.
283283
### playwright_get_visible_html
284284
Get the HTML content of the current page.
285285

286+
- **Inputs:**
287+
- **`selector`** *(string, optional)*:
288+
CSS selector to limit the HTML to a specific container. If provided, only the HTML of the first matching element is returned; an error is returned if no element matches.
289+
- **`removeScripts`** *(boolean, optional, default: false)*:
290+
Remove all script tags from the HTML to reduce noise.
291+
- **`removeComments`** *(boolean, optional, default: false)*:
292+
Remove all HTML comments to clean up the output.
293+
- **`removeStyles`** *(boolean, optional, default: false)*:
294+
Remove all style tags from the HTML.
295+
- **`removeMeta`** *(boolean, optional, default: false)*:
296+
Remove all meta tags from the HTML, wherever they appear in the document.
297+
- **`minify`** *(boolean, optional, default: false)*:
298+
Minify the HTML output by removing whitespace between tags and trimming leading and trailing whitespace.
299+
- **`cleanHtml`** *(boolean, optional, default: false)*:
300+
Convenience option that enables `removeScripts`, `removeComments`, `removeStyles`, and `removeMeta` together for cleaner HTML output. It does not enable `minify`.
301+
286302
- **Response:**
287303
- **`content`** *(string)*:
288-
The complete HTML content of the current page.
304+
The HTML content of the current page, optionally filtered based on the provided parameters.
289305

290306
---
291307

src/tools.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,15 @@ export function createToolDefinitions() {
328328
description: "Get the HTML content of the current page",
329329
inputSchema: {
330330
type: "object",
331-
properties: {},
331+
properties: {
332+
selector: { type: "string", description: "CSS selector to limit the HTML to a specific container" },
333+
removeScripts: { type: "boolean", description: "Remove all script tags from the HTML (default: false)" },
334+
removeComments: { type: "boolean", description: "Remove all HTML comments (default: false)" },
335+
removeStyles: { type: "boolean", description: "Remove all style tags from the HTML (default: false)" },
336+
removeMeta: { type: "boolean", description: "Remove all meta tags from the HTML (default: false)" },
337+
cleanHtml: { type: "boolean", description: "Perform comprehensive HTML cleaning (default: false)" },
338+
minify: { type: "boolean", description: "Minify the HTML output (default: false)" }
339+
},
332340
required: [],
333341
},
334342
},
@@ -458,4 +466,4 @@ export const tools = [
458466
...BROWSER_TOOLS,
459467
...API_TOOLS,
460468
...CODEGEN_TOOLS
461-
];
469+
];

src/tools/browser/visiblePage.ts

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,93 @@ export class VisibleHtmlTool extends BrowserToolBase {
8383
}
8484
return this.safeExecute(context, async (page) => {
8585
try {
86-
const htmlContent = await page!.content();
86+
const { selector, removeScripts, removeComments, removeStyles, removeMeta, minify, cleanHtml } = args;
87+
88+
// Get the HTML content
89+
let htmlContent: string;
90+
91+
if (selector) {
92+
// If a selector is provided, get only the HTML for that element
93+
const element = await page.$(selector);
94+
if (!element) {
95+
return createErrorResponse(`Element with selector "${selector}" not found`);
96+
}
97+
htmlContent = await page.evaluate((el) => el.outerHTML, element);
98+
} else {
99+
// Otherwise get the full page HTML
100+
htmlContent = await page.content();
101+
}
102+
103+
// Determine if we need to apply filters
104+
const shouldRemoveScripts = removeScripts || cleanHtml;
105+
const shouldRemoveComments = removeComments || cleanHtml;
106+
const shouldRemoveStyles = removeStyles || cleanHtml;
107+
const shouldRemoveMeta = removeMeta || cleanHtml;
108+
109+
// Apply filters in the browser context
110+
if (shouldRemoveScripts || shouldRemoveComments || shouldRemoveStyles || shouldRemoveMeta || minify) {
111+
htmlContent = await page.evaluate(
112+
({ html, removeScripts, removeComments, removeStyles, removeMeta, minify }) => {
113+
// Create a DOM parser to work with the HTML
114+
const parser = new DOMParser();
115+
const doc = parser.parseFromString(html, 'text/html');
116+
117+
// Remove script tags if requested
118+
if (removeScripts) {
119+
const scripts = doc.querySelectorAll('script');
120+
scripts.forEach(script => script.remove());
121+
}
122+
123+
// Remove style tags if requested
124+
if (removeStyles) {
125+
const styles = doc.querySelectorAll('style');
126+
styles.forEach(style => style.remove());
127+
}
128+
129+
// Remove meta tags if requested
130+
if (removeMeta) {
131+
const metaTags = doc.querySelectorAll('meta');
132+
metaTags.forEach(meta => meta.remove());
133+
}
134+
135+
// Remove HTML comments if requested
136+
if (removeComments) {
137+
const removeComments = (node) => {
138+
const childNodes = node.childNodes;
139+
for (let i = childNodes.length - 1; i >= 0; i--) {
140+
const child = childNodes[i];
141+
if (child.nodeType === 8) { // 8 is for comment nodes
142+
node.removeChild(child);
143+
} else if (child.nodeType === 1) { // 1 is for element nodes
144+
removeComments(child);
145+
}
146+
}
147+
};
148+
removeComments(doc.documentElement);
149+
}
150+
151+
// Get the processed HTML
152+
let result = doc.documentElement.outerHTML;
153+
154+
// Minify if requested
155+
if (minify) {
156+
// Simple minification: remove extra whitespace
157+
result = result.replace(/>\s+</g, '><').trim();
158+
}
159+
160+
return result;
161+
},
162+
{
163+
html: htmlContent,
164+
removeScripts: shouldRemoveScripts,
165+
removeComments: shouldRemoveComments,
166+
removeStyles: shouldRemoveStyles,
167+
removeMeta: shouldRemoveMeta,
168+
minify
169+
}
170+
);
171+
}
172+
87173
return createSuccessResponse(`HTML content:\n${htmlContent}`);
88174
} catch (error) {
89175
return createErrorResponse(`Failed to get visible HTML content: ${(error as Error).message}`);

0 commit comments

Comments
 (0)