Solution 1 :

Requests takes a dict of parameters and also has a json method, so this can be much cleaner.

import logging
import time

import requests


def scrape_site(self):
    """
    Scrape every page of the Shopify ``products.json`` endpoint at ``self.url``.

    Pages are fetched with ``limit=250`` (Shopify's per-page maximum) until
    the endpoint returns an empty product list. Each product is reduced to a
    small dict and accumulated on ``self.items``.

    :return: list of dicts with keys 'title', 'image', 'handle', 'variants'
    """
    self.items = []
    page = 1

    with requests.Session() as s:
        while True:
            params = {
                'page': page,
                'limit': 250,
            }

            try:
                r = s.get(self.url, params=params, headers=self.headers,
                          proxies=self.proxy, verify=False, timeout=20)
                r.raise_for_status()
                output = r.json()
                # Past the last page Shopify returns {"products": []}. That
                # dict is truthy, so testing `not output` never breaks the
                # loop — test the product list itself.
                products = output.get('products', [])
                if not products:
                    break
                for product in products:
                    images = product.get('images')
                    product_item = {
                        'title': product['title'],
                        # Some products ship with no images; avoid IndexError.
                        'image': images[0]['src'] if images else None,
                        'handle': product['handle'],
                        'variants': product['variants'],
                    }
                    self.items.append(product_item)
                logging.info(f'Successfully scraped page {page}')
                page += 1
                time.sleep(1)  # be polite: throttle between pages

            except Exception as e:
                # Any request/parse failure ends the scrape with whatever
                # pages were collected so far.
                logging.error(e)
                break

    return self.items

Problem :

So I’m new to coding in general, but for my first project I’m trying to create a monitor to monitor product changes to a Shopify site.

My method was grab publicly shared code online and work backwards from there to understand it, so I’ve got the following code in a wider class which seems to take the products.json by looping through the pages.

But when I load https://www.hanon-shop.com/collections/all/products.json in my browser and then print the items list my code builds, the first few products are different between the two — how does that make sense?

def scrape_site(self):
        """
        Scrapes the specified Shopify site and adds items to array
        :return: None
        """
        self.items = []
        # NOTE(review): prefer `with rq.Session() as s:` so the session is
        # closed even if an exception escapes; here it is closed manually below.
        s = rq.Session()
        page = 1
        # `page` doubles as the loop flag: setting it to 0 terminates the loop.
        while page > 0:
            try:
                html = s.get(self.url + '?page=' + str(page) + '&limit=250', headers=self.headers, proxies=self.proxy, verify=False, timeout=20)
                # Parses the response body and pulls out the product list;
                # raises KeyError/JSONDecodeError on non-JSON or error pages,
                # which the broad except below silently turns into "stop".
                output = json.loads(html.text)['products']
                if output == []:
                    page = 0
                else:
                    for product in output:
                        # NOTE(review): each item is wrapped in a one-element
                        # list before being appended, so self.items becomes a
                        # list of single-item lists rather than a flat list of
                        # dicts — probably unintended.
                        product_item = [{'title': product['title'], 'image': product['images'][0]['src'], 'handle': product['handle'], 'variants':product['variants']}]
                        self.items.append(product_item)
                    logging.info(msg='Successfully scraped site')
                    page += 1
            except Exception as e:
                # Catches everything (network errors, JSON errors, missing
                # keys, products with no images) and stops the scrape.
                logging.error(e)
                page = 0
            # Throttle between page requests.
            time.sleep(0.5)
        s.close()

By