# amazon_shop/predict_help.py
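"""Scraping and parsing helpers for the WebShop demo.

Contains search-result and item-page parsers for three backends (eBay,
Amazon, and a self-hosted WebShop server), plus converters that turn
scraped HTML and product dicts into the text observations and valid
action sets consumed by the agent.
"""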
import json
import re
import time
from enum import Enum

import requests
import torch
from bs4 import BeautifulSoup
from bs4.element import Comment
class Page(Enum):
DESC = "description"
FEATURES = "features"
ITEM_PAGE = "item_page"
RESULTS = "results"
REVIEWS = "reviews"
SEARCH = "search"
SUB_PAGE = "item_sub_page"
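# HEADER_ mimics a desktop Chrome browser so eBay/Amazon serve full pages;
# the remaining constants configure the debug dump file, the per-page
# product cap, and the self-hosted WebShop server.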
HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
DEBUG_HTML = "temp.html"
NUM_PROD_LIMIT = 10
WEBSHOP_URL = "http://3.83.245.205:3000"
WEBSHOP_SESSION = "abc"
def parse_results_ebay(query, page_num=None, verbose=True):
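    """Scrape one page of eBay search results for `query`.

    Returns a list of dicts with keys `asin`, `Title`, and `Price` (a price
    string, a [low, high] pair of strings for ranges, or None on failure).
    Tied to eBay's current markup, so selectors may need updating.
    """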
query_string = '+'.join(query.split())
page_num = 1 if page_num is None else page_num
url = f'https://www.ebay.com/sch/i.html?_nkw={query_string}&_pgn={page_num}'
if verbose:
print(f"Search Results URL: {url}")
webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
soup = BeautifulSoup(webpage.text, 'html.parser')
products = soup.select('.s-item__wrapper.clearfix')
results = []
for item in products[:NUM_PROD_LIMIT]:
title = item.select_one('.s-item__title').text.strip()
if "shop on ebay" in title.lower():
# Skip "Shop on ebay" product title
continue
link = item.select_one('.s-item__link')['href']
asin = link.split("?")[0][len("https://www.ebay.com/itm/"):]
try:
price = item.select_one('.s-item__price').text
if "to" in price:
prices = price.split(" to ")
price = [p.strip("$") for p in prices]
        except AttributeError:
price = None
results.append({
"asin": asin,
"Title": title,
"Price": price
})
if verbose:
print(f"Scraped {len(results)} products")
return results
def parse_item_page_ebay(asin, verbose=True):
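    """Scrape a single eBay item page into a product dict.

    Collects Title, Price, MainImage, Rating, options, Description (pulled
    from the description iframe), and BulletPoints; missing fields fall
    back to "N/A", None, or empty values.
    """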
product_dict = {}
product_dict["asin"] = asin
url = f"https://www.ebay.com/itm/{asin}"
if verbose:
print(f"Item Page URL: {url}")
begin = time.time()
webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
end = time.time()
if verbose:
print(f"Item page scraping took {end-begin} seconds")
soup = BeautifulSoup(webpage.content, "html.parser")
# Title
try:
product_dict["Title"] = soup.find('h1', {'class': 'x-item-title__mainTitle'}).text.strip()
    except AttributeError:
product_dict["Title"] = "N/A"
# Price: Get price string, extract decimal numbers from string
    try:
        price_str = soup.find('div', {'class': 'mainPrice'}).text
        prices = re.findall(r'\d*\.?\d+', price_str)
        product_dict["Price"] = prices[0]
    except (AttributeError, IndexError):
        product_dict["Price"] = "N/A"
# Main Image
try:
img_div = soup.find('div', {'id': 'mainImgHldr'})
img_link = img_div.find('img', {'id': 'icImg'})["src"]
product_dict["MainImage"] = img_link
    except (AttributeError, TypeError, KeyError):
product_dict["MainImage"] = ""
# Rating
try:
rating = soup.find('span', {'class': 'reviews-star-rating'})["title"].split()[0]
    except (TypeError, KeyError):
rating = None
product_dict["Rating"] = rating
# Options
options, options_to_images = {}, {} # TODO: options_to_images possible?
try:
option_blocks = soup.findAll('select', {'class': 'msku-sel'})
for block in option_blocks:
name = block["name"].strip().strip(":")
option_tags = block.findAll("option")
opt_list = []
for option_tag in option_tags:
if "select" not in option_tag.text.lower():
# Do not include "- select -" (aka `not selected`) choice
opt_list.append(option_tag.text)
options[name] = opt_list
    except (AttributeError, KeyError):
options = {}
product_dict["options"], product_dict["option_to_image"] = options, options_to_images
# Description
desc = None
try:
# Ebay descriptions are shown in `iframe`s
desc_link = soup.find('iframe', {'id': 'desc_ifr'})["src"]
desc_webpage = requests.get(desc_link, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
desc_soup = BeautifulSoup(desc_webpage.content, "html.parser")
desc = ' '.join(desc_soup.text.split())
    except Exception:
desc = "N/A"
product_dict["Description"] = desc
# Features
features = None
try:
features = soup.find('div', {'class': 'x-about-this-item'}).text
    except AttributeError:
features = "N/A"
product_dict["BulletPoints"] = features
return product_dict
def parse_results_ws(query, page_num=None, verbose=True):
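    """Fetch one page of search results from the WebShop server.

    Prices are parsed to floats (or a [low, high] pair for ranges).
    """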
query_string = '+'.join(query.split())
page_num = 1 if page_num is None else page_num
url = (
f'{WEBSHOP_URL}/search_results/{WEBSHOP_SESSION}/'
f'{query_string}/{page_num}'
)
if verbose:
print(f"Search Results URL: {url}")
webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
soup = BeautifulSoup(webpage.content, 'html.parser')
products = soup.findAll('div', {'class': 'list-group-item'})
results = []
for product in products:
asin = product.find('a', {'class': 'product-link'})
title = product.find('h4', {'class': 'product-title'})
price = product.find('h5', {'class': 'product-price'})
if "\n" in title:
title = title.text.split("\n")[0].strip()
else:
title = title.text.strip().strip("\n")
if "to" in price.text:
# Parse if price presented as range
prices = price.text.split(" to ")
price = [float(p.strip().strip("\n$")) for p in prices]
else:
price = float(price.text.strip().strip("\n$"))
results.append({
"asin": asin.text,
"Title": title,
"Price": price
})
if verbose:
print(f"Scraped {len(results)} products")
return results
def parse_item_page_ws(asin, query, page_num, options, verbose=True):
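    """Fetch a WebShop item page plus its Description and Features sub-pages.

    `options` holds the currently selected options; it is serialized with
    `json.dumps` into the URL so the server renders the matching state.
    """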
product_dict = {}
product_dict["asin"] = asin
query_string = '+'.join(query.split())
options_string = json.dumps(options)
url = (
f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
f'{asin}/{query_string}/{page_num}/{options_string}'
)
if verbose:
print(f"Item Page URL: {url}")
webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
soup = BeautifulSoup(webpage.content, 'html.parser')
# Title, Price, Rating, and MainImage
product_dict["Title"] = soup.find('h2').text
h4_headers = soup.findAll("h4")
for header in h4_headers:
text = header.text
if "Price" in text:
product_dict["Price"] = text.split(":")[1].strip().strip("$")
elif "Rating" in text:
product_dict["Rating"] = text.split(":")[1].strip()
product_dict["MainImage"] = soup.find('img')['src']
# Options
options, options_to_image = {}, {}
option_blocks = soup.findAll("div", {'class': 'radio-toolbar'})
for block in option_blocks:
name = block.find("input")["name"]
labels = block.findAll("label")
inputs = block.findAll("input")
opt_list = []
        # `input_tag` avoids shadowing the built-in `input`
        for label, input_tag in zip(labels, inputs):
            opt = label.text
            opt_img_path = input_tag["onclick"].split("href=")[1].strip('\';')
            opt_img_url = f'{WEBSHOP_URL}{opt_img_path}'
            opt_list.append(opt)
            options_to_image[opt] = opt_img_url
options[name] = opt_list
product_dict["options"] = options
product_dict["option_to_image"] = options_to_image
# Description
url = (
f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
f'{asin}/{query_string}/{page_num}/Description/{options_string}'
)
if verbose:
print(f"Item Description URL: {url}")
webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
soup = BeautifulSoup(webpage.content, 'html.parser')
product_dict["Description"] = soup.find(name="p", attrs={'class': 'product-info'}).text.strip()
# Features
url = (
f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
f'{asin}/{query_string}/{page_num}/Features/{options_string}'
)
if verbose:
print(f"Item Features URL: {url}")
webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
soup = BeautifulSoup(webpage.content, 'html.parser')
bullets = soup.find(name="ul").findAll(name="li")
product_dict["BulletPoints"] = '\n'.join([b.text.strip() for b in bullets])
return product_dict
# Query -> Search Result ASINs
def parse_results_amz(query, page_num=None, verbose=True):
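    """Scrape one page of Amazon search results for `query`.

    Returns up to NUM_PROD_LIMIT dicts with `asin`, `Title`, and `Price`.
    Tied to Amazon's current markup and may be blocked by anti-bot pages.
    """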
url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
if page_num is not None:
url += "&page=" + str(page_num)
if verbose:
print(f"Search Results URL: {url}")
webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
soup = BeautifulSoup(webpage.content, 'html.parser')
products = soup.findAll('div', {'data-component-type': 's-search-result'})
    if not products:
        # findAll returns an empty list (never None) when nothing matches
        with open(DEBUG_HTML, "w") as f:
            f.write(str(soup))
        raise Exception(f"Couldn't find search results page; wrote HTML to {DEBUG_HTML} for inspection")
results = []
for product in products[:NUM_PROD_LIMIT]:
asin = product['data-asin']
title = product.find("h2", {'class': "a-size-mini"})
price_div = product.find("div", {'class': 's-price-instructions-style'})
price = price_div.find("span", {'class': 'a-offscreen'})
result = {
'asin': asin,
'Title': title.text.strip(),
'Price': price.text.strip().strip("$")
}
results.append(result)
if verbose:
print("Scraped", len(results), "products")
return results
# Scrape information of each product
def parse_item_page_amz(asin, verbose=True):
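    """Scrape a single Amazon item page into a product dict.

    Collects Title, Price, Rating, BulletPoints, Description, MainImage,
    and option values (with per-option swatch images where available).
    """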
product_dict = {}
product_dict["asin"] = asin
url = f"https://www.amazon.com/dp/{asin}"
if verbose:
print("Item Page URL:", url)
begin = time.time()
webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
end = time.time()
if verbose:
print(f"Item page scraping took {end-begin} seconds")
soup = BeautifulSoup(webpage.content, "html.parser")
# Title
try:
title = soup.find("span", attrs={"id": 'productTitle'})
title = title.string.strip().replace(',', '')
except AttributeError:
title = "N/A"
product_dict["Title"] = title
# Price
try:
parent_price_span = soup.find(name="span", class_="apexPriceToPay")
price_span = parent_price_span.find(name="span", class_="a-offscreen")
price = float(price_span.getText().replace("$", ""))
except AttributeError:
price = "N/A"
product_dict["Price"] = price
# Rating
try:
rating = soup.find(name="span", attrs={"id": "acrPopover"})
if rating is None:
rating = "N/A"
else:
rating = rating.text
except AttributeError:
rating = "N/A"
product_dict["Rating"] = rating.strip("\n").strip()
# Features
try:
features = soup.find(name="div", attrs={"id": "feature-bullets"}).text
except AttributeError:
features = "N/A"
product_dict["BulletPoints"] = features
# Description
try:
desc_body = soup.find(name="div", attrs={"id": "productDescription_feature_div"})
desc_div = desc_body.find(name="div", attrs={"id": "productDescription"})
desc_ps = desc_div.findAll(name="p")
desc = " ".join([p.text for p in desc_ps])
except AttributeError:
desc = "N/A"
product_dict["Description"] = desc.strip()
# Main Image
try:
        imgtag = soup.find("img", {"id": "landingImage"})
        imageurl = imgtag.attrs["src"]
except AttributeError:
imageurl = ""
product_dict["MainImage"] = imageurl
# Options
options, options_to_image = {}, {}
try:
option_body = soup.find(name='div', attrs={"id": "softlinesTwister_feature_div"})
if option_body is None:
option_body = soup.find(name='div', attrs={"id": "twister_feature_div"})
option_blocks = option_body.findAll(name='ul')
for block in option_blocks:
name = json.loads(block["data-a-button-group"])["name"]
# Options
opt_list = []
for li in block.findAll("li"):
img = li.find(name="img")
if img is not None:
opt = img["alt"].strip()
opt_img = img["src"]
if len(opt) > 0:
options_to_image[opt] = opt_img
else:
opt = li.text.strip()
if len(opt) > 0:
opt_list.append(opt)
options[name.replace("_name", "").replace("twister_", "")] = opt_list
except AttributeError:
options = {}
product_dict["options"], product_dict["option_to_image"] = options, options_to_image
return product_dict
# Get text observation from html
# TODO[john-b-yang]: Similar to web_agent_site/envs/...text_env.py func def, merge?
def convert_html_to_text(html, simple=False, clicked_options=None, visited_asins=None):
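    """Strip page HTML down to a text observation for the agent.

    With `simple=True`, visible text nodes are joined with ' [SEP] '.
    Otherwise buttons, option labels, and product links are wrapped in
    [button] / [clicked button] markers, using `clicked_options` and
    `visited_asins` to mark state.
    """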
def tag_visible(element):
ignore = {'style', 'script', 'head', 'title', 'meta', '[document]'}
return (
element.parent.name not in ignore and not isinstance(element, Comment)
)
    # Default to empty sets so the membership checks below don't fail
    # when no clicked options / visited asins are passed in
    clicked_options = clicked_options or set()
    visited_asins = visited_asins or set()
    html_obj = BeautifulSoup(html, 'html.parser')
texts = html_obj.findAll(text=True)
visible_texts = filter(tag_visible, texts)
if simple:
return ' [SEP] '.join(t.strip() for t in visible_texts if t != '\n')
else:
observation = ''
for t in visible_texts:
if t == '\n': continue
if t.parent.name == 'button': # button
processed_t = f'[button] {t} [button]'
elif t.parent.name == 'label': # options
if f'{t}' in clicked_options:
processed_t = f' [clicked button] {t} [clicked button]'
observation = f'You have clicked {t}.\n' + observation
else:
processed_t = f' [button] {t} [button]'
elif t.parent.get('class') == ["product-link"]: # asins
if f'{t}' in visited_asins:
processed_t = f'\n[clicked button] {t} [clicked button]'
else:
processed_t = f'\n[button] {t} [button]'
else: # regular, unclickable text
processed_t = str(t)
observation += processed_t + '\n'
return observation
# Get action from dict of values retrieved from html
def convert_dict_to_actions(page_type, products=None, asin=None, page_num=None) -> dict:
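    """Build the list of valid `click[...]` actions for the current page.

    `products`, `asin`, and `page_num` are required depending on
    `page_type`; a zero image-feature vector is always attached for the
    downstream model.
    """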
info = {"valid": []}
if page_type == Page.RESULTS:
info["valid"] = ['click[back to search]']
if products is None or page_num is None:
print(page_num)
print(products)
raise Exception('Provide `products`, `page_num` to get `results` valid actions')
# Decide whether to add `next >` as clickable based on # of search results
if len(products) > 10:
info["valid"].append('click[next >]')
# Add `< prev` as clickable if not first page of search results
if page_num > 1:
info["valid"].append('click[< prev]')
for product in products:
info["valid"].append("click[item - " + product["Title"] + "]")
if page_type == Page.ITEM_PAGE:
if products is None or asin is None:
raise Exception('Provide `products` and `asin` to get `item_page` valid actions')
info["valid"] = ['click[back to search]', 'click[< prev]', 'click[description]',\
'click[features]', 'click[buy now]'] # To do: reviews
if "options" in products[asin]:
for key, values in products[asin]["options"].items():
for value in values:
info["valid"].append("click[" + value + "]")
if page_type == Page.SUB_PAGE:
info["valid"] = ['click[back to search]', 'click[< prev]']
info['image_feat'] = torch.zeros(512)
return info
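

# Minimal smoke test: a sketch assuming a WebShop server is reachable at
# WEBSHOP_URL with the hard-coded session id; the query is illustrative.
if __name__ == "__main__":
    results = parse_results_ws("red sneakers", page_num=1)
    if results:
        item = parse_item_page_ws(results[0]["asin"], "red sneakers", 1, options={})
        print(item["Title"], item.get("Price", "N/A"))
        actions = convert_dict_to_actions(
            Page.ITEM_PAGE, products={item["asin"]: item}, asin=item["asin"]
        )
        print(actions["valid"])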