This will get just the body:
/**
 * Fetches a URL and shows everything between the <body> tags in a
 * modeless dialog, wrapped in a <textarea>.
 *
 * @param {string} url - the page to fetch
 */
function simpleScraper(url) {
  let html = UrlFetchApp.fetch(url).getContentText();
  // Locate the opening body tag by '<body' + '>' so pages whose body
  // tag carries attributes (e.g. <body class="x">) still work; the
  // original indexOf('<body>') returned -1 on those pages.
  const open = html.indexOf('<body');
  const start = open === -1 ? 0 : html.indexOf('>', open) + 1;
  const end = html.indexOf('</body>');
  html = html.slice(start, end === -1 ? html.length : end);
  // Escape markup so page content (e.g. a literal '</textarea>')
  // cannot close our textarea early; the textarea decodes entities.
  html = html.replace(/&/g, '&amp;').replace(/</g, '&lt;');
  html = '<textarea rows="12" cols="30">' + html + '</textarea>';
  SpreadsheetApp.getUi().showModelessDialog(HtmlService.createHtmlOutput(html), "HTML");
}
This might help to strip the script out:
/**
 * Fetches a page, strips <script>...</script> and <style>...</style>
 * sections from the body, and shows the remainder in a modeless
 * dialog inside a <textarea>.
 *
 * @param {string} [url] - the page to fetch
 */
function simpleScraper(url = 'http://jimesteban.com') {
  const s = UrlFetchApp.fetch(url).getContentText();
  const p1 = '<textarea rows="12" cols="30">';
  const p2 = '</textarea>';
  let html = s.slice(s.indexOf('<body>') + 6, s.indexOf('</body>'));
  // Remove EVERY <tag>...</tag> section, not just the first one.
  // NOTE: the original sliced '</script>' with +8, but '</script>'
  // is 9 characters long, so a stray '>' was left in the output.
  const strip = (text, tag) => {
    const close = '</' + tag + '>';
    let start;
    while ((start = text.indexOf('<' + tag)) !== -1) {
      const end = text.indexOf(close, start);
      if (end === -1) {
        // Unterminated section: drop the rest of the text.
        text = text.slice(0, start);
        break;
      }
      text = text.slice(0, start) + text.slice(end + close.length);
    }
    return text;
  };
  html = strip(strip(html, 'script'), 'style');
  SpreadsheetApp.getUi().showModelessDialog(
    HtmlService.createHtmlOutput(p1 + html + p2).setWidth(800).setHeight(600),
    "HTML");
}
How about creating a TEMPORARY sheet, using setFormula('=IMPORTXML("URL","XPATH")'), retrieving the data, and deleting the sheet once the operation is finished?
Example:
/**
 * Scrapes via a temporary sheet: writes an IMPORTXML formula into a
 * throwaway "Temporary" sheet, reads the result back, and deletes the
 * sheet again.
 */
function getData() {
  var activeSpreadsheet = SpreadsheetApp.getActiveSpreadsheet();
  var newSheet = activeSpreadsheet.getSheetByName("Temporary");
  // Remove a leftover sheet from a previous (possibly failed) run.
  if (newSheet != null) {
    activeSpreadsheet.deleteSheet(newSheet);
  }
  // Create a fresh sheet with the name Temporary.
  newSheet = activeSpreadsheet.insertSheet();
  newSheet.setName("Temporary");
  try {
    newSheet.getRange("A1").setFormula('=IMPORTXML("http://www.google.com", "//a/@href")');
    // Apply pending spreadsheet changes so the IMPORTXML result is
    // materialized before we read it back; without this the read can
    // happen before the formula has been evaluated.
    SpreadsheetApp.flush();
    // Retrieve and print the data.
    Logger.log(newSheet.getDataRange().getValues());
  } finally {
    // Always clean up the Temporary sheet, even if the read throws.
    activeSpreadsheet.deleteSheet(newSheet);
  }
}
Output:

Reference:
method:setFormula
Basically, I’m building a scraper to get the contents of a web page, so that then I can count a certain keyword occurrence and get that keyword density from the page.
I can already count the number of occurrences; the issue I’m having is that UrlFetchApp.fetch
pulls the raw source code, so I ended up counting not only the words that would appear on the frontend of the page but also whatever else is in the source code.
To fix that, I wanted to filter the page so that I only grab the actual content of the page.
I would love to get the content just as =importxml("[page]","//div[@id='main']")
does in google sheets but I have no idea on how to apply that logic on code, I’m using google scripts so, using querySelector
, or getElementsByTag
is not an option, unfortunately.
the reason I’m trying to do this in code and not by using the google sheets formula is that this would be applied to a big number of pages.
/**
 * Extracts the inner text of every <tag ...>...</tag> occurrence.
 * Pure string processing, so it works in Apps Script where no DOM
 * (querySelector / getElementsByTagName) is available.
 *
 * @param {string} html - markup to search (pass it lower-cased for
 *     case-insensitive tag matching)
 * @param {string} tag - tag name without angle brackets, e.g. 'p'
 * @returns {string[]} inner content of each matched element (empty
 *     array when the tag does not occur)
 */
function extractTagText(html, tag) {
  // Built via the RegExp constructor so the closing tag's '/' is
  // escaped properly; the original literals like /<p>(.*?)</p>/g
  // were syntax errors. '(?:\\s[^>]*)?' also accepts attributes
  // (e.g. <h1 class="t">), which the original <p>-only pattern missed.
  var re = new RegExp('<' + tag + '(?:\\s[^>]*)?>([\\s\\S]*?)</' + tag + '>', 'g');
  var out = [];
  var m;
  while ((m = re.exec(html)) !== null) {
    out.push(m[1]);
  }
  return out;
}

/**
 * Fetches a page and returns only the human-visible text of the
 * common content tags, lower-cased and ready for keyword counting.
 *
 * @param {string} url - the page to fetch
 * @param {string} kw - the keyword (lower-cased here so matching is
 *     not case sensitive)
 * @param {Object} [options] - UrlFetchApp.fetch options
 * @returns {string[][]} one array of extracted strings per tag, in
 *     the same order as the filter list below
 */
function getVisibleContent(url, kw, options) {
  var response = UrlFetchApp.fetch(url, options);
  var html = response.getContentText();
  // Slice from just past the opening <body ...> tag; the original's
  // lastIndexOf("<body") + 1 skipped only one character and kept the
  // rest of the tag in the text.
  var open = html.indexOf('<body');
  var start = open === -1 ? 0 : html.indexOf('>', open) + 1;
  var body = html.slice(start, html.lastIndexOf('</body>'));
  var lowerBody = body.toLowerCase(); // case-insensitive matching
  kw = String(kw).toLowerCase();      // same for the keyword
  // One generic pass replaces the original's duplicated if/else
  // chain, which declared `filter` twice, guarded every branch on a
  // <p> match (crashing with a null .length when no <p> existed),
  // and never actually handled h5/h6.
  var filter = ['p', 'li', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'];
  var newBody = [];
  for (var i = 0; i < filter.length; i++) {
    newBody[i] = extractTagText(lowerBody, filter[i]);
  }
  return newBody;
}
I know my code is bad so I'm willing to scrape all this if there's a better way of accomplishing this.
Thanks in advance.
Thanks for replying~ I gave this a try and it is somewhat similar to what I have right now, but unfortunately it does not work for this case. Thanks for taking your time on this.
Thanks for replying! aa this looks like a good way of doing it, it might be kinda slow for what I’m working on, I’ll give it a try, looks like a great way of tackling this issue. Thanks again