Extraction Rules
Customise your response by adding extraction rules.
WebScrapingAPI allows you to extract specific sections of the webpage. You can do so by using the
extract_rules
parameter. This parameter's value is a
stringified object
, which accepts the following parameters:

| Parameter | Type | Description |
|---|---|---|
| selector (required) | string | The CSS selector. |
| output | string | The output format of the selected element. Accepted values are: html (returns HTML format), text (returns text format), @[attr] (returns the given attribute of the element). |
| all | int | Returns all matching elements. To enable, set this parameter to "1". |
A full example of what this parameter would look like in production:
extract_rules='{"title": {"selector": "h1", "output": "html"}, "subtitle": {"selector": "p.p-big", "output": "text"}}'
get
https://api.webscrapingapi.com/
v1
Extract Content Based on CSS Rules
The full GET request using the
extract_rules
parameter should be:
https://api.webscrapingapi.com/v1?api_key=<YOUR_API_KEY>&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D
cURL
NodeJS
Python
PHP
Go
Java
.NET
Ruby
# The URL is quoted so the shell does not split the command at "&" or treat "<" / ">" as redirections.
curl "https://api.webscrapingapi.com/v1?api_key=<YOUR_API_KEY>&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D"
// Sends the extract_rules GET request over HTTPS and prints the response body.
const https = require("https");

// "path" carries the URL-encoded query string (api_key, url, extract_rules).
// %7B%7Bapi_key%7D%7D is the encoded placeholder {{api_key}}; replace it with your API key.
const options = {
  method: "GET",
  hostname: "api.webscrapingapi.com",
  port: null,
  path: "/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D",
  headers: {},
};

const req = https.request(options, (res) => {
  // The body arrives in chunks; collect them and decode once the stream ends.
  const chunks = [];

  res.on("data", (chunk) => {
    chunks.push(chunk);
  });

  res.on("end", () => {
    const body = Buffer.concat(chunks);
    console.log(body.toString());
  });
});

// Without an "error" listener, a DNS or connection failure is thrown as an uncaught exception.
req.on("error", (err) => {
  console.error(`Request failed: ${err.message}`);
});

req.end();
import requests

API_KEY = '<YOUR_API_KEY>'
SCRAPER_URL = 'https://api.webscrapingapi.com/v1'
TARGET_URL = 'https://webscrapingapi.com/'

# requests URL-encodes every value in PARAMS, so "url" and "extract_rules"
# are given here in plain, unencoded form.
PARAMS = {
    "api_key": API_KEY,
    "url": TARGET_URL,
    "extract_rules": '{"title": {"selector": "h1", "output": "text"}, "subtitle": {"selector": "p.p-big", "output": "text"}}',
}

# An explicit timeout keeps the call from blocking forever if the server never answers.
response = requests.get(SCRAPER_URL, params=PARAMS, timeout=30)
print(response.text)
<?php
// Sends the extract_rules GET request with cURL and echoes either the
// response body or the cURL error message.
$handle = curl_init();

// CURLOPT_RETURNTRANSFER makes curl_exec() return the body instead of printing it.
$requestOptions = [
    CURLOPT_URL => "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D",
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_ENCODING => "",
    CURLOPT_MAXREDIRS => 10,
    CURLOPT_TIMEOUT => 30,
    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
    CURLOPT_CUSTOMREQUEST => "GET",
];
curl_setopt_array($handle, $requestOptions);

$result = curl_exec($handle);
$failure = curl_error($handle);
curl_close($handle);

if (!$failure) {
    echo $result;
} else {
    echo "cURL Error #:" . $failure;
}
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main sends the extract_rules GET request and prints the response value
// followed by the response body. Each failure is reported and ends the program
// instead of being discarded.
func main() {
	url := "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D"

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println("building request failed:", err)
		return
	}

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so stop before touching res.Body.
		fmt.Println("request failed:", err)
		return
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body failed:", err)
		return
	}

	fmt.Println(res)
	fmt.Println(string(body))
}
// Send the extract_rules GET request and read the response body as a String.
String requestUrl = "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D";
HttpResponse<String> response = Unirest.get(requestUrl).asString();
// Build a GET request, point a RestClient at the encoded API URL, and execute it.
var requestUrl = "https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D";
var request = new RestRequest(Method.GET);
var client = new RestClient(requestUrl);
IRestResponse response = client.Execute(request);
require 'uri'
require 'net/http'
require 'openssl'

# Sends the extract_rules GET request over HTTPS and prints the response body.
url = URI("https://api.webscrapingapi.com/v1?api_key=%7B%7Bapi_key%7D%7D&url=https%3A%2F%2Fwebscrapingapi.com&extract_rules=%7B%22title%22%3A%20%7B%22selector%22%3A%20%22h1%22%2C%20%22output%22%3A%20%22text%22%7D%2C%20%22subtitle%22%3A%20%7B%22selector%22%3A%20%22p.p-big%22%2C%20%22output%22%3A%20%22text%22%7D%7D")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true
# Verify the server certificate. VERIFY_NONE would accept any certificate and
# expose the API key and response to man-in-the-middle attacks.
http.verify_mode = OpenSSL::SSL::VERIFY_PEER

request = Net::HTTP::Get.new(url)
response = http.request(request)
puts response.read_body
Important! The
url
and
extract_rules
parameters have to be URL-encoded (e.g. &url=https%3A%2F%2Fwww.webscrapingapi.com%2F&extract_rules=%7B%22title%22%3A%20%7B%22selector... ).

Example response:
{"title":"The leading REST API for web scraping","subtitle":"Join 10,000+ businesses that use WebScrapingAPI to gather
data at scale.WebScrapingAPI collects the HTML from any web page using a simple API and provides\n ready-to-process data
to everyone in your company.Use one API to collect data, from any website. Integrate with any development language and\n
customize your\n requests in just 30 seconds."}
Here are more examples that should help you better understand what the object passed to the
extract_rules
parameter should look like:

HTML Sample | Extraction Rule | Rule Description | JSON Output |
---|---|---|---|
<div class="title">This is my title </div> | {"title":".title"} | Return the element having the CSS class .title | {"title":"<div>This is my title</div>"} |
<div><a href="https://www.webscrapingapi.com/product/">Product</a><a href="https://www.webscrapingapi.com/pricing/">Pricing</a></div> | {"links": {"selector": "a", "output": "@href", "all":"1"}} | Return the href attribute of all links on page | {"links":["https://www.webscrapingapi.com/product/","https://www.webscrapingapi.com/pricing/"]} |
<div><img src="https://www.webscrapingapi.com/assets/images/icons/full.svg?v=41d081a6f0"></div> | {"image": {"selector": "img", "output": "@src"}} | Return the src attribute of the first image available on page | {"image":"https://www.webscrapingapi.com/assets/images/icons/full.svg?v=41d081a6f0"} |
Last modified 3mo ago