Examples
Real-world examples and use cases for NextRows data extraction
These examples show NextRows in action and demonstrate best practices for handling common data extraction scenarios.
E-commerce Product Extraction
Basic Product Information
Extract product details from e-commerce sites:
{
  "type": "url",
  "data": ["https://example-store.com/products"],
  "prompt": "Extract product name, price, rating, and availability status for each product"
}
Example response:

{
  "success": true,
  "data": [
    {
      "product_name": "Wireless Bluetooth Headphones",
      "price": "$79.99",
      "rating": "4.5 out of 5 stars",
      "availability": "In Stock"
    },
    {
      "product_name": "Smart Fitness Tracker",
      "price": "$149.99",
      "rating": "4.2 out of 5 stars",
      "availability": "Limited Stock"
    }
  ]
}
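To send this request from a script, here is a minimal Python sketch using the same /v1/extract endpoint and Bearer-token header as the other examples on this page; the placeholder API key is an assumption to replace with your own.

import requests

# Minimal request using the endpoint and auth header shown elsewhere on this page.
# Replace the placeholder API key with your own.
response = requests.post(
    "https://api.nextrows.com/v1/extract",
    headers={"Authorization": "Bearer sk-nr-your-api-key"},
    json={
        "type": "url",
        "data": ["https://example-store.com/products"],
        "prompt": "Extract product name, price, rating, and availability status for each product"
    }
)

result = response.json()
if result.get("success"):
    for product in result["data"]:
        print(product["product_name"], product["price"])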
Product Details with Schema Validation
For production use, add schema validation to ensure data quality:
import requests

def extract_products_with_validation(urls):
    response = requests.post(
        "https://api.nextrows.com/v1/extract",
        headers={"Authorization": "Bearer sk-nr-your-api-key"},
        json={
            "type": "url",
            "data": urls,
            "prompt": "Extract product name, price in USD as number, rating as decimal, stock quantity, and product URL",
            "schema": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "product_name": {"type": "string", "minLength": 1},
                        "price": {"type": "number", "minimum": 0},
                        "rating": {"type": "number", "minimum": 0, "maximum": 5},
                        "stock_quantity": {"type": "integer", "minimum": 0},
                        "product_url": {"type": "string", "format": "uri"}
                    },
                    "required": ["product_name", "price"]
                }
            }
        }
    )
    return response.json()

# Usage
product_urls = [
    "https://store.com/electronics",
    "https://store.com/clothing",
    "https://store.com/home-garden"
]
results = extract_products_with_validation(product_urls)
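Because the schema guarantees numeric prices and ratings, the validated rows can be filtered and ranked directly. A short sketch continuing from the results variable above, assuming the success/data response shape shown earlier:

if results.get("success"):
    products = results.get("data", [])

    # The schema guarantees price is a number, so numeric comparisons are safe.
    in_budget = [p for p in products if p["price"] <= 100]

    # rating is optional in the schema, so default to 0 when it is missing.
    top_rated = sorted(products, key=lambda p: p.get("rating", 0), reverse=True)[:5]

    print(f"{len(in_budget)} products under $100")
    for p in top_rated:
        print(p["product_name"], p.get("rating"))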
Job Board Scraping
Basic Job Listings
Extract job postings from career sites:
curl -X POST https://api.nextrows.com/v1/extract \
  -H "Authorization: Bearer sk-nr-your-api-key" \
  -H "Content-Type: application/json" \
  -d '{
    "type": "url",
    "data": ["https://jobs.example.com/tech-jobs"],
    "prompt": "Extract job title, company name, location, salary range, experience level, and application deadline for each job posting"
  }'
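The same request from Python makes it easy to post-process the listings. The field names used below (location, for example) are illustrative, since this request supplies no schema; adjust them to whatever keys your response actually contains.

import requests

# Python equivalent of the cURL request above.
response = requests.post(
    "https://api.nextrows.com/v1/extract",
    headers={"Authorization": "Bearer sk-nr-your-api-key"},
    json={
        "type": "url",
        "data": ["https://jobs.example.com/tech-jobs"],
        "prompt": "Extract job title, company name, location, salary range, "
                  "experience level, and application deadline for each job posting"
    }
)

result = response.json()
jobs = result.get("data", []) if result.get("success") else []

# "location" is an illustrative key; without a schema the extracted field names
# follow the prompt wording, so adjust this to match your actual response.
remote_jobs = [job for job in jobs if "remote" in str(job.get("location", "")).lower()]
print(f"{len(remote_jobs)} remote jobs out of {len(jobs)} listings")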
Real Estate Listings
Property Details Extraction
Extract comprehensive property information:
const axios = require('axios');

async function extractRealEstateListings(urls) {
  try {
    const response = await axios.post(
      'https://api.nextrows.com/v1/extract',
      {
        type: 'url',
        data: urls,
        prompt: `Extract property details including:
          - Property address (full address)
          - Price (as number without currency symbols)
          - Property type (house, condo, townhouse, etc.)
          - Bedrooms and bathrooms count
          - Square footage
          - Lot size
          - Year built
          - Listing agent name and contact
          - Days on market
          - Property description summary (first 200 characters)`,
        schema: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              address: { type: 'string' },
              price: { type: 'number' },
              property_type: { type: 'string' },
              bedrooms: { type: 'integer', minimum: 0 },
              bathrooms: { type: 'number', minimum: 0 },
              square_feet: { type: 'integer', minimum: 0 },
              lot_size: { type: 'string' },
              year_built: { type: 'integer' },
              agent_name: { type: 'string' },
              agent_contact: { type: 'string' },
              days_on_market: { type: 'integer' },
              description: { type: 'string', maxLength: 200 }
            },
            required: ['address', 'price', 'property_type']
          }
        }
      },
      {
        headers: {
          'Authorization': 'Bearer sk-nr-your-api-key',
          'Content-Type': 'application/json'
        }
      }
    );
    return response.data;
  } catch (error) {
    console.error('Extraction failed:', error.response?.data || error.message);
    throw error;
  }
}

// Usage
const propertyUrls = [
  'https://realtor.com/city/apartments',
  'https://realtor.com/city/houses',
  'https://realtor.com/city/condos'
];

extractRealEstateListings(propertyUrls)
  .then(results => console.log(JSON.stringify(results, null, 2)))
  .catch(error => console.error(error));
News and Content Aggregation
Article Extraction
Extract articles from news websites:
import requests
from datetime import datetime
import csv

class NewsAggregator:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.nextrows.com/v1/extract"

    def extract_articles(self, news_urls):
        response = requests.post(
            self.base_url,
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "type": "url",
                "data": news_urls,
                "prompt": """Extract article information:
                - Headline (main title)
                - Author name(s)
                - Publication date (in YYYY-MM-DD format if possible)
                - Article category/section
                - Article summary (first 300 characters of content)
                - Article URL
                - Number of comments (if visible)
                - Tags or keywords (if available)""",
                "schema": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "headline": {"type": "string"},
                            "author": {"type": "string"},
                            "publication_date": {"type": "string"},
                            "category": {"type": "string"},
                            "summary": {"type": "string", "maxLength": 300},
                            "url": {"type": "string", "format": "uri"},
                            "comment_count": {"type": "integer"},
                            "tags": {"type": "array", "items": {"type": "string"}}
                        },
                        "required": ["headline", "summary"]
                    }
                }
            }
        )
        return response.json()

    def save_to_csv(self, articles, filename):
        if not articles:
            return
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = articles[0].keys()
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for article in articles:
                # Handle list fields for CSV
                if 'tags' in article and isinstance(article['tags'], list):
                    article['tags'] = ', '.join(article['tags'])
                writer.writerow(article)

# Usage example
aggregator = NewsAggregator("sk-nr-your-api-key")

news_sources = [
    "https://techcrunch.com/ai",
    "https://news.ycombinator.com",
    "https://www.theverge.com/tech"
]

result = aggregator.extract_articles(news_sources)
if result.get('success'):
    aggregator.save_to_csv(result['data'], f"news_{datetime.now().strftime('%Y%m%d')}.csv")
    print(f"Extracted {len(result['data'])} articles")
Social Media and Directory Scraping
Business Directory Extraction
Extract business information from directories:
{
  "type": "url",
  "data": ["https://business-directory.com/restaurants"],
  "prompt": "Extract business listings with: business name, address, phone number, website URL, business category, rating, number of reviews, hours of operation, and price range indicator",
  "schema": {
    "type": "array",
    "items": {
      "type": "object",
      "properties": {
        "business_name": {"type": "string"},
        "address": {"type": "string"},
        "phone": {"type": "string"},
        "website": {"type": "string", "format": "uri"},
        "category": {"type": "string"},
        "rating": {"type": "number", "minimum": 0, "maximum": 5},
        "review_count": {"type": "integer", "minimum": 0},
        "hours": {"type": "string"},
        "price_range": {"type": "string"}
      },
      "required": ["business_name", "address"]
    }
  }
}
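Directory pages often list the same business under several categories, so a light deduplication pass keeps the results clean. A sketch using the field names from the schema above:

def dedupe_businesses(listings):
    """Drop duplicate listings, keyed on the business_name and address fields from the schema."""
    seen = set()
    unique = []
    for biz in listings:
        key = (biz.get("business_name", "").strip().lower(),
               biz.get("address", "").strip().lower())
        if key not in seen:
            seen.add(key)
            unique.append(biz)
    # Highest-rated first; rating is optional in the schema, so default to 0.
    return sorted(unique, key=lambda b: b.get("rating", 0), reverse=True)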
Financial Data Extraction
Stock Market Data
Extract financial information:
import requests
import pandas as pd

def extract_stock_data(stock_pages):
    response = requests.post(
        "https://api.nextrows.com/v1/extract",
        headers={"Authorization": "Bearer sk-nr-your-api-key"},
        json={
            "type": "url",
            "data": stock_pages,
            "prompt": """Extract stock information:
            - Company name
            - Stock symbol/ticker
            - Current price (as number)
            - Price change (as number with + or -)
            - Percentage change (as number)
            - Market cap
            - Trading volume
            - 52-week high
            - 52-week low
            - P/E ratio""",
            "schema": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "company_name": {"type": "string"},
                        "symbol": {"type": "string"},
                        "current_price": {"type": "number"},
                        "price_change": {"type": "number"},
                        "percentage_change": {"type": "number"},
                        "market_cap": {"type": "string"},
                        "volume": {"type": "string"},
                        "week_52_high": {"type": "number"},
                        "week_52_low": {"type": "number"},
                        "pe_ratio": {"type": "number"}
                    },
                    "required": ["company_name", "symbol", "current_price"]
                }
            }
        }
    )

    result = response.json()
    if result.get('success'):
        # Convert to pandas DataFrame for analysis
        return pd.DataFrame(result['data'])
    else:
        raise Exception(f"Extraction failed: {result}")

# Usage
stock_urls = [
    "https://finance.yahoo.com/quote/AAPL",
    "https://finance.yahoo.com/quote/GOOGL",
    "https://finance.yahoo.com/quote/MSFT"
]

stock_df = extract_stock_data(stock_urls)
print(stock_df.head())

# Save to Excel
stock_df.to_excel('stock_data.xlsx', index=False)
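Because the schema types the price fields as numbers, the DataFrame can be analyzed directly. A short follow-on sketch using the column names defined in the schema above (it assumes every row came back with those fields populated):

# Rank the quotes by daily percentage move; column names come from the schema above.
movers = stock_df.sort_values("percentage_change", ascending=False)
print(movers[["symbol", "current_price", "percentage_change"]].head())

# Flag stocks trading within 5% of their 52-week high.
near_high = stock_df[stock_df["current_price"] >= 0.95 * stock_df["week_52_high"]]
print(near_high[["symbol", "current_price", "week_52_high"]])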
Event and Conference Listings
Event Information Extraction
Extract event details from event listing sites:
curl -X POST https://api.nextrows.com/v1/extract \
  -H "Authorization: Bearer sk-nr-your-api-key" \
  -H "Content-Type: application/json" \
  -d '{
    "type": "url",
    "data": ["https://eventbrite.com/d/ny--new-york/tech-events/"],
    "prompt": "Extract event details: event name, organizer, date and time, location/venue, ticket price, event description (first 200 chars), number of attendees, event category, and registration URL"
  }'
Academic and Research Data
Research Paper Information
Extract academic paper details:
import requests

def extract_research_papers(paper_urls):
    return requests.post(
        "https://api.nextrows.com/v1/extract",
        headers={"Authorization": "Bearer sk-nr-your-api-key"},
        json={
            "type": "url",
            "data": paper_urls,
            "prompt": """Extract academic paper information:
            - Paper title
            - Authors (all authors)
            - Publication date
            - Journal or conference name
            - Abstract (full text)
            - Keywords/tags
            - Citation count
            - DOI
            - PDF download link""",
            "schema": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "authors": {"type": "array", "items": {"type": "string"}},
                        "publication_date": {"type": "string"},
                        "journal": {"type": "string"},
                        "abstract": {"type": "string"},
                        "keywords": {"type": "array", "items": {"type": "string"}},
                        "citation_count": {"type": "integer"},
                        "doi": {"type": "string"},
                        "pdf_url": {"type": "string"}
                    },
                    "required": ["title", "authors"]
                }
            }
        }
    ).json()
# Usage for multiple research databases
research_urls = [
    "https://arxiv.org/list/cs.AI/recent",
    "https://scholar.google.com/scholar?q=machine+learning",
    "https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText=artificial+intelligence"
]
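Once extracted, the citation_count and keywords fields from the schema make it easy to rank and filter the papers. A short sketch continuing from research_urls above, assuming the success/data response shape shown in the earlier examples:

result = extract_research_papers(research_urls)
papers = result.get("data", []) if result.get("success") else []

# citation_count is optional in the schema, so default to 0 when ranking.
ranked = sorted(papers, key=lambda p: p.get("citation_count", 0), reverse=True)

# Keep papers whose keywords mention a topic of interest.
ml_papers = [p for p in ranked
             if any("learning" in kw.lower() for kw in p.get("keywords", []))]

for paper in ml_papers[:10]:
    print(paper.get("citation_count", 0), paper["title"])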
Batch Processing Examples
Processing Large Datasets
Handle large-scale extractions efficiently:
import requests
import time
import json
import logging
from typing import List, Dict, Any

class BatchExtractor:
    def __init__(self, api_key: str, batch_size: int = 10, delay: float = 1.0):
        self.api_key = api_key
        self.batch_size = batch_size
        self.delay = delay
        self.base_url = "https://api.nextrows.com/v1/extract"

        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def extract_batch(self, urls: List[str], prompt: str, schema: Dict = None) -> List[Dict[Any, Any]]:
        """Extract data from a batch of URLs"""
        request_data = {
            "type": "url",
            "data": urls,
            "prompt": prompt
        }
        if schema:
            request_data["schema"] = schema

        response = requests.post(
            self.base_url,
            headers={"Authorization": f"Bearer {self.api_key}"},
            json=request_data
        )

        if response.status_code == 200:
            return response.json().get('data', [])
        else:
            self.logger.error(f"Batch failed: {response.status_code} - {response.text}")
            return []

    def process_large_dataset(self, all_urls: List[str], prompt: str, schema: Dict = None) -> List[Dict[Any, Any]]:
        """Process a large dataset in batches"""
        all_results = []
        total_batches = (len(all_urls) + self.batch_size - 1) // self.batch_size

        self.logger.info(f"Processing {len(all_urls)} URLs in {total_batches} batches")

        for i in range(0, len(all_urls), self.batch_size):
            batch_urls = all_urls[i:i + self.batch_size]
            batch_num = (i // self.batch_size) + 1

            self.logger.info(f"Processing batch {batch_num}/{total_batches} ({len(batch_urls)} URLs)")

            try:
                batch_results = self.extract_batch(batch_urls, prompt, schema)
                all_results.extend(batch_results)
                self.logger.info(f"Batch {batch_num} completed: {len(batch_results)} items extracted")

                # Rate limiting
                if i + self.batch_size < len(all_urls):
                    time.sleep(self.delay)
            except Exception as e:
                self.logger.error(f"Batch {batch_num} failed: {e}")
                continue

        self.logger.info(f"Total extraction completed: {len(all_results)} items")
        return all_results

# Usage example
extractor = BatchExtractor("sk-nr-your-api-key", batch_size=5, delay=2.0)

# Process 100 product URLs
product_urls = [f"https://store.com/product/{i}" for i in range(1, 101)]

results = extractor.process_large_dataset(
    product_urls,
    "Extract product name, price, and availability",
    schema={
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "price": {"type": "number"},
                "available": {"type": "boolean"}
            }
        }
    }
)

# Save results
with open('products.json', 'w') as f:
    json.dump(results, f, indent=2)
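The batch runner above logs failed batches and moves on. For long crawls it can help to retry those batches once the first pass finishes; the helper below is a sketch of that idea (not a built-in NextRows feature), reusing extract_batch from the class above:

def process_with_retry(extractor, urls, prompt, schema=None, max_rounds=2):
    """Run batches with the extractor above, then re-run any batch that came back empty."""
    pending = list(urls)
    collected = []
    for _ in range(max_rounds):
        failed = []
        for i in range(0, len(pending), extractor.batch_size):
            batch = pending[i:i + extractor.batch_size]
            rows = extractor.extract_batch(batch, prompt, schema)
            if rows:
                collected.extend(rows)
            else:
                # An empty result may also mean the pages had no matching data,
                # so the number of retry rounds is capped.
                failed.extend(batch)
        if not failed:
            break
        pending = failed
    return collected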
Best Practices from Examples
1. Always Use Schema Validation in Production
# Without validation (not recommended)
{"prompt": "Extract product data"}

# With validation (recommended)
{
  "prompt": "Extract product data",
  "schema": {
    "type": "array",
    "items": {
      "type": "object",
      "properties": {
        "name": {"type": "string", "minLength": 1},
        "price": {"type": "number", "minimum": 0}
      },
      "required": ["name", "price"]
    }
  }
}
2. Handle Errors Gracefully
def safe_extract(url, prompt, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.post(api_url, json={...})
            if response.status_code == 200:
                return response.json()
        except Exception as e:
            if attempt == max_retries - 1:
                raise e
            time.sleep(2 ** attempt)  # Exponential backoff
    return None
3. Be Specific with Prompts
# ❌ Vague
"Extract data from this page"
# ✅ Specific
"Extract product name, price in USD, customer rating (1-5 stars), and stock status (in stock/out of stock) from each product listing"
These examples demonstrate real-world usage patterns. Adapt the prompts and schemas to match your specific data requirements and website structures.