Links

Extraction Rules

Customise your response by adding extraction rules.
Extraction rules can be applied whether JavaScript rendering is enabled or disabled.
WebScrapingAPI allows you to extract specific sections of the webpage. You can do so by using the extract_rules parameter.
This parameter's value is a stringified object, which accepts the following parameters:
Parameter
Type
Description
selector Required
string
The CSS selector.
output
string
The output format of the selected element. Accepted values are: html (returns the element as HTML), text (returns the element's text), and @[attr] (returns the value of the given attribute of the element, e.g. @href).
all
int
Returns every element that matches the selector instead of only the first one. To enable, set this parameter to "1".
A full example of what this parameter looks like in production:
extract_rules='{"title": {"selector": "h1", "output": "html"}, "subtitle": {"selector": "p.p-big", "output": "text"}}'

Extraction Rules Integration Examples

get
https://api.webscrapingapi.com/
v1
Extract Content Based on CSS Rules
The full GET request for the extract_rules should be:
https://api.webscrapingapi.com/v1?api_key=<YOUR_API_KEY>&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D
cURL
NodeJS
Python
PHP
Go
Java
.NET
Ruby
# The URL must be quoted: unquoted, the shell treats `&` as "run in background"
# and `<` / `>` as redirections, so the query string would never reach curl intact.
# Replace <YOUR_API_KEY> with your own key before running.
curl 'https://api.webscrapingapi.com/v1?api_key=<YOUR_API_KEY>&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D'
// Sends a GET request to the API with URL-encoded `url` and `extract_rules`
// query parameters and prints the response body to stdout.
const https = require("https");

const options = {
  method: "GET",
  hostname: "api.webscrapingapi.com",
  port: null,
  // %7B%7Bapi_key%7D%7D is the URL-encoded form of {{api_key}}: replace it with your API key.
  path: "/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D",
  headers: {},
};

const req = https.request(options, (res) => {
  // The body arrives in chunks; collect them and decode once the stream ends.
  const chunks = [];

  res.on("data", (chunk) => {
    chunks.push(chunk);
  });

  res.on("end", () => {
    const body = Buffer.concat(chunks);
    console.log(body.toString());
  });
});

// Without an "error" listener, a connection failure (DNS, refused, reset)
// is thrown as an uncaught exception and crashes the process.
req.on("error", (err) => {
  console.error(`Request failed: ${err.message}`);
  process.exitCode = 1;
});

req.end();
import requests

API_KEY = '<YOUR_API_KEY>'
SCRAPER_URL = 'https://api.webscrapingapi.com/v1'
TARGET_URL = 'https://webscrapingapi.com/'

# Values are passed unencoded here: requests URL-encodes everything given via `params`.
PARAMS = {
    "api_key": API_KEY,
    "url": TARGET_URL,
    "extract_rules": '{"title": {"selector": "h1", "output": "html"}, "subtitle": {"selector": "p.p-big", "output": "text"}}',
}

# Always set a timeout: without one, requests waits indefinitely for the server.
response = requests.get(SCRAPER_URL, params=PARAMS, timeout=30)
print(response.text)
<?php
// Calls the API with URL-encoded `url` and `extract_rules` query parameters
// and echoes either the response body or the cURL error message.
$endpoint = "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D";

$settings = [
    CURLOPT_URL => $endpoint,
    // Return the body from curl_exec() instead of printing it directly.
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_ENCODING => "",
    CURLOPT_MAXREDIRS => 10,
    CURLOPT_TIMEOUT => 30,
    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
    CURLOPT_CUSTOMREQUEST => "GET",
];

$handle = curl_init();
curl_setopt_array($handle, $settings);

$body = curl_exec($handle);
// Read the error text before closing the handle.
$failure = curl_error($handle);
curl_close($handle);

if (!$failure) {
    echo $body;
} else {
    echo "cURL Error #:" . $failure;
}
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
)

// main sends a GET request to the API with URL-encoded `url` and
// `extract_rules` query parameters, then prints the response and its body.
// Any failure is reported and the program exits with a non-zero status.
func main() {
	// %7B%7Bapi_key%7D%7D is the URL-encoded form of {{api_key}}: replace it with your API key.
	url := "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D"

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Fatalf("building request: %v", err)
	}

	// On error res is nil, so it must be checked before touching res.Body.
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatalf("sending request: %v", err)
	}

	// Close explicitly rather than with defer: log.Fatalf below would skip deferred calls.
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatalf("reading response body: %v", err)
	}

	fmt.Println(res)
	fmt.Println(string(body))
}
// API endpoint with URL-encoded `url` and `extract_rules` query parameters.
// %7B%7Bapi_key%7D%7D is the URL-encoded form of {{api_key}}: replace it with your API key.
String endpoint = "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D";

// Issue the request through Unirest and obtain the response as a string.
HttpResponse<String> response = Unirest.get(endpoint).asString();
// API endpoint with URL-encoded `url` and `extract_rules` query parameters.
// %7B%7Bapi_key%7D%7D is the URL-encoded form of {{api_key}}: replace it with your API key.
string endpoint = "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D";

// Build the RestSharp client and a GET request, then execute it.
RestClient client = new RestClient(endpoint);
RestRequest request = new RestRequest(Method.GET);
IRestResponse response = client.Execute(request);
require 'uri'
require 'net/http'
require 'openssl'

# Sends a GET request to the API with URL-encoded `url` and `extract_rules`
# query parameters and prints the response body.
# %7B%7Bapi_key%7D%7D is the URL-encoded form of {{api_key}}: replace it with your API key.
url = URI("https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true
# Verify the server certificate. VERIFY_NONE would accept any certificate and
# expose the API key in the query string to a man-in-the-middle.
http.verify_mode = OpenSSL::SSL::VERIFY_PEER

request = Net::HTTP::Get.new(url)
response = http.request(request)
puts response.read_body
Important! The values of the url and extract_rules parameters must be URL-encoded (e.g. &url=https%3A%2F%2Fwww.webscrapingapi.com%2F&extract_rules=%7B%22title%22%3A%20%7B%22selector... ).
{"title":"The leading REST API for web scraping","subtitle":"Join 10,000+ businesses that use WebScrapingAPI to gather
data at scale.WebScrapingAPI collects the HTML from any web page using a simple API and provides\n ready-to-process data
to everyone in your company.Use one API to collect data, from any website. Integrate with any development language and\n
customize your\n requests in just 30 seconds."}

More extract_rules object examples

Here are more examples to help you better understand what the object passed to the extract_rules parameter should look like:
HTML Sample
Extraction Rule
Rule Description
JSON Output
<div class="title">This is my title </div>
{"title":".title"}
Return the element having the CSS class .title
{"title":"<div>This is my title</div>"}
<div><a href="https://www.webscrapingapi.com/product/">Product</a><a href="https://www.webscrapingapi.com/pricing/">Pricing</a></div>
{"links": {"selector": "a", "output": "@href", "all":"1"}}
Return the href attribute of all links on page
{"links":["https://www.webscrapingapi.com/product/","https://www.webscrapingapi.com/pricing/"]}
<div><img src="https://www.webscrapingapi.com/assets/images/icons/full.svg?v=41d081a6f0"></div>
{"image": {"selector": "img", "output": "@src"}}
Return the src attribute of the first image available on page
{"image":"https://www.webscrapingapi.com/assets/images/icons/full.svg?v=41d081a6f0"}