Comment on page
Extraction Rules
Customize your response by adding extraction rules.
WebScrapingAPI allows you to extract specific sections of the webpage. You can do so by using the
extract_rules
parameter. This parameter's value can be a
string
(the CSS selector or XPath) or a stringified object.
In the second case, the parameter accepts the following options:
Parameter | Type | Description |
---|---|---|
selector (required) | string | The CSS selector or the XPath. |
selector_type | string | The type of the selector option. Accepted values are css and xpath . The default value is xpath if the selector option starts with / , and css otherwise. |
output | string | The output format of the selected element. Accepted values are:
- html - returns HTML format
- text - (default) returns text format
- @[attr] - returns the attribute of the element
- table_json - returns the JSON format of a table
- table_array - returns the array format of a table
- another extract_rules object - used to parse nested elements. |
all | int | Set to 1 to return every element that matches the selector, or to 0 to return only the first match. The default value for this parameter is "1" . |
clean | int | Removes leading and trailing white spaces, line terminator characters, and newlines from the result. The default value for this parameter is "1" . |
A full example of how this parameter would look in production is:
extract_rules='{"title": {"selector": "h1", "output": "html"}, "subtitle": {"selector": "p.p-big", "output": "text"}}'
or:
extract_rules='{"title": "h1"}'
get
https://api.webscrapingapi.com/
v1
Extract Content Based on CSS Rules
The full GET request using the
extract_rules
parameter should be: https://api.webscrapingapi.com/v1?api_key=<YOUR_API_KEY>&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D
cURL
NodeJS
Python
PHP
Go
Java
.NET
Ruby
# The URL must be quoted: unquoted, the shell treats `&` as a background
# operator and `<`/`>` as redirections, so curl would never see the full query.
curl 'https://api.webscrapingapi.com/v1?api_key=<YOUR_API_KEY>&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D'
// Sends a GET request to the WebScrapingAPI v1 endpoint and prints the raw
// response body. The query string in `path` is already percent-encoded; the
// `%7B%7Bapi_key%7D%7D` part is an encoded `{{api_key}}` placeholder that must
// be replaced with a real API key.
const https = require("https");

const options = {
  method: "GET",
  hostname: "api.webscrapingapi.com",
  port: null,
  path: "/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D",
  headers: {},
};

const req = https.request(options, (res) => {
  // The body arrives in chunks; collect them and decode once at the end.
  const chunks = [];

  res.on("data", (chunk) => {
    chunks.push(chunk);
  });

  res.on("end", () => {
    const body = Buffer.concat(chunks);
    console.log(body.toString());
  });
});

// Without an "error" listener, a DNS/TLS/socket failure is emitted as an
// unhandled "error" event and terminates the process.
req.on("error", (err) => {
  console.error(`Request failed: ${err.message}`);
});

req.end();
import requests

# Replace the placeholder with your own API key.
API_KEY = '<YOUR_API_KEY>'
SCRAPER_URL = 'https://api.webscrapingapi.com/v1'
TARGET_URL = 'https://webscrapingapi.com/'

# Values are given unencoded here: requests percent-encodes everything passed
# through `params`, including the target URL and the stringified rules object.
PARAMS = {
    "api_key": API_KEY,
    "url": TARGET_URL,
    "extract_rules": '{"title": {"selector": "h1", "output": "html"}, "subtitle": {"selector": "p.p-big", "output": "text"}}'
}

# requests applies no timeout by default, so a stalled connection would block
# forever; give up after 30 seconds instead.
response = requests.get(SCRAPER_URL, params=PARAMS, timeout=30)
print(response.text)
<?php
// Performs a GET request against the WebScrapingAPI v1 endpoint with cURL and
// echoes either the response body or the cURL error message.
$handle = curl_init();

// The URL is already percent-encoded; `%7B%7Bapi_key%7D%7D` is an encoded
// `{{api_key}}` placeholder for your API key.
curl_setopt($handle, CURLOPT_URL, "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D");
// Return the body from curl_exec() instead of printing it directly.
curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
curl_setopt($handle, CURLOPT_ENCODING, "");
curl_setopt($handle, CURLOPT_MAXREDIRS, 10);
curl_setopt($handle, CURLOPT_TIMEOUT, 30);
curl_setopt($handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
curl_setopt($handle, CURLOPT_CUSTOMREQUEST, "GET");

$body = curl_exec($handle);
$failure = curl_error($handle);
curl_close($handle);

// A non-empty error string means the transfer failed.
echo $failure ? ("cURL Error #:" . $failure) : $body;
package main
import (
"fmt"
"net/http"
"io/ioutil"
)
// main sends a GET request to the WebScrapingAPI v1 endpoint and prints the
// response value followed by the response body. Any failure is reported and
// ends the program instead of being silently discarded.
func main() {
	// Already percent-encoded; `%7B%7Bapi_key%7D%7D` is an encoded
	// `{{api_key}}` placeholder for your API key.
	url := "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D"

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println("building request failed:", err)
		return
	}

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be touched (deferring
		// res.Body.Close() before this check would panic).
		fmt.Println("request failed:", err)
		return
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body failed:", err)
		return
	}

	fmt.Println(res)
	fmt.Println(string(body))
}
// GET the WebScrapingAPI v1 endpoint and read the response body as a String.
// The URL is already percent-encoded; `%7B%7Bapi_key%7D%7D` is an encoded
// `{{api_key}}` placeholder for your API key.
String requestUrl = "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D";
HttpResponse<String> response = Unirest.get(requestUrl).asString();
// GET the WebScrapingAPI v1 endpoint with RestSharp.
// The URL is already percent-encoded; `%7B%7Bapi_key%7D%7D` is an encoded
// `{{api_key}}` placeholder for your API key.
var requestUrl = "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D";
var client = new RestClient(requestUrl);
var request = new RestRequest(Method.GET);
IRestResponse response = client.Execute(request);
require 'uri'
require 'net/http'
require 'openssl'
# Sends a GET request to the WebScrapingAPI v1 endpoint and prints the body.
# The URL is already percent-encoded; `%7B%7Bapi_key%7D%7D` is an encoded
# `{{api_key}}` placeholder for your API key.
url = URI("https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true
# Verify the server certificate. VERIFY_NONE would accept any certificate and
# expose the API key to man-in-the-middle interception.
http.verify_mode = OpenSSL::SSL::VERIFY_PEER

request = Net::HTTP::Get.new(url)
response = http.request(request)
puts response.read_body
Important! The
url
and extract_rules
parameters have to be URL-encoded (e.g. &url=https%3A%2F%2Fwww.webscrapingapi.com%2F&extract_rules=%7B%22title%22%3A%20%7B%22selector... ).
Example response:
{"title":"The leading REST API for web scraping","subtitle":"Join 10,000+ businesses that use WebScrapingAPI to gather
data at scale.WebScrapingAPI collects the HTML from any web page using a simple API and provides\n ready-to-process data
to everyone in your company.Use one API to collect data, from any website. Integrate with any development language and\n
customize your\n requests in just 30 seconds."}
Here are more examples that should help you better understand what the object passed to the
extract_rules
parameter should look like:
HTML Sample | Extraction Rule | Rule Description | JSON Output |
---|---|---|---|
<div class="title"> This is my title </div> | {"title": ".title"} | Return the text content of the elements having the CSS class .title | { "title": [ "This is my title" ] } |
<div> <a href="https://www.webscrapingapi.com/product/"> Product </a> <a href="https://www.webscrapingapi.com/pricing/"> Pricing </a> </div> | { "links": { "selector": "a", "output": "@href", "all": "1" } } | Return the href attribute of all links on page | { "links": [ "https://www.webscrapingapi.com/product/","https://www.webscrapingapi.com/pricing/" ] } |
<div> <img src="https://www.webscrapingapi.com/assets/images/icons/full.svg?v=41d081a6f0" > </div> | { "image": { "selector": "img", "output": "@src", "all": 0 } } | Return the src attribute of the first image available on page | { "image": [ "https://www.webscrapingapi.com/assets/images/icons/full.svg?v=41d081a6f0" ] } |
<table class="ants">
<thead> <tr> <th>Region</th> <th>No. species</th> </tr> </thead> <tbody> <tr> <td>Europe</td> <td>180</td> </tr> </tbody>
</table> | { "table": { "selector": ".ants", "output": "table_json", "all": 0 } } | Return the JSON format of the first table having the CSS class .ants | { "table": [ { "Region": "Europe", "No. species": "180" } ] } |
<table class="ants">
<thead> <tr> <th>Region</th> <th>No. species</th> </tr> </thead> <tbody> <tr> <td>Europe</td> <td>180</td> </tr> </tbody>
</table> | { "table": { "selector": ".ants", "output": "table_array", "all": 0 } } | Return the array format of the first table having the CSS class .ants | { "table": [ ["Europe", "180"] ] } |
<ul> <li> <p class="name">Item1</p> <p class="price">100</p> </li> <li> <p class="name">Item2</p> <p class="price">1000</p> </li>
</ul> | { "items": { "selector": "li", "output": { "name": { "selector": ".name", "all": 0 }, "price": { "selector": ".price", "all": 0 } }, "all": 1 } } | Return the name and the price of each list item. | { "items": [ { "name": "Item1", "price": "100" }, { "name": "Item2", "price": "1000" } ] } |
Last modified 9mo ago