Solution 1 :

This will get just the body:

// Fetches a page and shows everything between <body> and </body>
// in a modeless dialog, wrapped in a <textarea>.
function simpleScraper(url) {
  let html = UrlFetchApp.fetch(url).getContentText();
  // Locate the opening body tag. The original searched for the literal
  // '<body>', which misses tags that carry attributes (e.g. <body class="x">).
  const open = html.indexOf('<body');
  const start = open === -1 ? 0 : html.indexOf('>', open) + 1;
  const end = html.indexOf('</body>');
  html = html.slice(start, end === -1 ? html.length : end);
  // NOTE(review): if the page itself contains '</textarea>' the dialog
  // markup will break — acceptable for a quick debugging view.
  html = '<textarea rows="12" cols="30">' + html + '</textarea>';
  SpreadsheetApp.getUi().showModelessDialog(HtmlService.createHtmlOutput(html), "HTML");
}

This might help to get the script out:

// Fetches a page, extracts the <body> content, strips <script> and <style>
// blocks, and shows the result in a modeless dialog.
function simpleScraper(url = 'http://jimesteban.com') {
  const s = UrlFetchApp.fetch(url).getContentText();
  const p1 = '<textarea rows="12" cols="30">';
  const p2 = '</textarea>';
  // Locate the body, tolerating attributes on the opening tag.
  const open = s.indexOf('<body');
  const start = open === -1 ? 0 : s.indexOf('>', open) + 1;
  const end = s.indexOf('</body>');
  let html = s.slice(start, end === -1 ? s.length : end);
  // Remove EVERY <script>…</script> block. The original removed only the
  // first one, and its '+8' offset was an off-by-one ('</script>' is nine
  // characters long), leaving a stray '>' in the output.
  let i;
  while ((i = html.indexOf('<script')) !== -1) {
    const j = html.indexOf('</script>', i);
    if (j === -1) { html = html.slice(0, i); break; } // unterminated tag
    html = html.slice(0, i) + html.slice(j + '</script>'.length);
  }
  // Same for <style>…</style> blocks.
  while ((i = html.indexOf('<style')) !== -1) {
    const j = html.indexOf('</style>', i);
    if (j === -1) { html = html.slice(0, i); break; }
    html = html.slice(0, i) + html.slice(j + '</style>'.length);
  }
  // setHeight expects a number, matching setWidth(800).
  SpreadsheetApp.getUi().showModelessDialog(
    HtmlService.createHtmlOutput(p1 + html + p2).setWidth(800).setHeight(600),
    "HTML"
  );
}

Solution 2 :

How about creating a TEMPORARY sheet, using `setFormula('=IMPORTXML("URL","XPATH")')` to retrieve the data, and deleting the sheet once the operation is finished?

Example:

// Scrapes a page by writing an IMPORTXML formula into a throw-away sheet,
// reading the result back, then deleting the sheet.
function getData() {
    var activeSpreadsheet = SpreadsheetApp.getActiveSpreadsheet();
    var newSheet = activeSpreadsheet.getSheetByName("Temporary");
    // Delete any leftover sheet from a previous (possibly failed) run.
    if (newSheet != null) {
      activeSpreadsheet.deleteSheet(newSheet);
    }
    // insertSheet accepts the name directly — no separate setName call needed.
    newSheet = activeSpreadsheet.insertSheet("Temporary");
    // Use setFormula with IMPORTXML to do the actual scraping.
    newSheet.getRange("A1").setFormula('=IMPORTXML("http://www.google.com", "//a/@href")');
    // Flush pending changes so the formula is committed before we read it back.
    // NOTE(review): IMPORTXML evaluates asynchronously; even after flush the
    // range may still read "Loading..." — verify against the target sheet.
    SpreadsheetApp.flush();
    // Retrieve and print the data.
    var values = newSheet.getDataRange().getValues();
    Logger.log(values);
    // Delete the Temporary sheet now that we are done with it.
    activeSpreadsheet.deleteSheet(newSheet);
    // Backward-compatible: callers that ignored the (previously undefined)
    // return value are unaffected.
    return values;
}

Output:

(screenshot of the logged IMPORTXML output)

Reference:

method:setFormula

Problem :

Basically, I’m building a scraper to get the contents of a web page, so that I can count occurrences of a certain keyword and compute its density on the page.
I can already count the number of occurrences; the issue I’m having is that UrlFetchApp.fetch pulls the full source code, so I ended up counting not only the words on what would be the front end of the page but also everything else in the source.

To work around that, I want to filter the page so that I only grab its actual visible content.

I would love to get the content just as `=IMPORTXML("[page]","//div[@id='main']")` does in Google Sheets, but I have no idea how to apply that logic in code. I’m using Google Apps Script, so using querySelector or getElementsByTagName is not an option, unfortunately.

the reason I’m trying to do this in code and not by using the google sheets formula is that this would be applied to a big number of pages.

// Rewritten version of the keyword-density snippet. The original had several
// defects: the regexes used an unescaped '/' (e.g. /<p>(.*?)</p>/g, a syntax
// error), every else-if branch tested the <p> match instead of its own tag,
// a null .match() result would throw on .length, 'filter' was declared twice,
// and 'kw' was read before it was defined.

/**
 * Returns the inner text of every occurrence of the given tags.
 * @param {string} html - HTML to scan (pass lower-cased text for
 *     case-insensitive keyword counting downstream)
 * @param {string[]} tags - tag names, e.g. ['p', 'li', 'h1']
 * @returns {string[]} inner content of each matched element, in order
 */
function extractTagText(html, tags) {
  var pieces = [];
  for (var t = 0; t < tags.length; t++) {
    // Matches <tag> or <tag attr="...">, non-greedy up to the closing tag.
    var re = new RegExp('<' + tags[t] + '(?:\\s[^>]*)?>([\\s\\S]*?)</' + tags[t] + '>', 'gi');
    var m;
    while ((m = re.exec(html)) !== null) {
      pieces.push(m[1]);
    }
  }
  return pieces;
}

/**
 * Fetches a page and counts occurrences of a keyword inside the text content
 * of the "visible" tags only (p, li, span, h1-h6).
 * @param {string} url - page to fetch
 * @param {string} kw - keyword to count (case-insensitive)
 * @param {Object} [options] - UrlFetchApp.fetch options
 * @returns {number} number of non-overlapping occurrences
 */
function countKeywordOnPage(url, kw, options) {
  var response = UrlFetchApp.fetch(url, options);
  var html = response.getContentText();
  // Isolate the body; tolerate attributes on the opening <body ...> tag.
  var lower = html.toLowerCase();
  var open = lower.indexOf('<body');
  var start = open === -1 ? 0 : html.indexOf('>', open) + 1;
  var end = lower.lastIndexOf('</body>');
  var body = html.slice(start, end === -1 ? html.length : end);
  // Lower-case both sides so the comparison is not case sensitive.
  var lowerBody = body.toLowerCase();
  var keyword = String(kw).toLowerCase();
  if (keyword.length === 0) return 0; // avoid an infinite scan below
  // The tags whose text we want to keep from the page.
  var filter = ['p', 'li', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'];
  var text = extractTagText(lowerBody, filter).join(' ');
  // Count non-overlapping occurrences of the keyword.
  var count = 0;
  var idx = text.indexOf(keyword);
  while (idx !== -1) {
    count++;
    idx = text.indexOf(keyword, idx + keyword.length);
  }
  return count;
}

I know my code is bad, so I'm willing to scrap all of this if there's a better way of accomplishing it.
Thanks in advance.

Comments

Comment posted by Mario Solorza

Thanks for replying! I gave this a try and it is somewhat similar to what I have right now, but unfortunately it does not work for this case. Thanks for taking your time on this.

Comment posted by Mario Solorza

Thanks for replying! aa this looks like a good way of doing it, it might be kinda slow for what I’m working on, I’ll give it a try, looks like a great way of tackling this issue. Thanks again

By