I believe this also works with Firefox, but I have not tested it. I use Chrome, so if you want to use chromedriver, check the version of your browser, download the matching driver, and add it to your system path. The only drawback of this approach is that it opens a browser window until the page has loaded (because we have to wait for the JavaScript to generate the match data). If you need anything else, let me know. Good luck!
https://chromedriver.chromium.org/downloads
Known issues: Sometimes it throws an "index out of range" error when retrieving the match data. I am looking into this; it looks like the XPath on each link sometimes changes slightly.
from selenium import webdriver
from lxml import html
from lxml.html import HtmlElement
def test():
    """Run ``get_last_5`` on a hard-coded sample match page and print the result."""
    # Sample match pages used for a manual check; add more URLs here as needed.
    sample_urls = [
        'https://www.mismarcadores.com/partido/noIPZ3Lj/#h2h;overall',
    ]
    for match_url in sample_urls:
        # Build the headline first, then scrape, so the progress messages
        # printed by get_last_5() appear before the final summary line.
        headline = "Scores after this match {u}".format(u=match_url)
        scores = get_last_5(match_url)
        print(headline, scores)
def _title_date_to_row_date(title_text):
    """Convert the title's ``dd.mm.yyyy hh:mm`` stamp to the rows' ``dd.mm.yy`` form.

    Simply taking the first eight characters of ``10.08.2021 13:00`` gives
    ``10.08.20`` (the century digits), which only equals the row date for
    matches played in 2020. The last two digits of the year are used instead.
    Text that does not look like ``dd.mm.yyyy`` falls back to the first eight
    characters.
    """
    tokens = title_text.split()
    if tokens:
        parts = tokens[0].split(".")
        if len(parts) == 3 and len(parts[2]) == 4:
            day, month, year = parts
            return "{d}.{m}.{y}".format(d=day, m=month, y=year[-2:])
    return title_text[0:8]


def _extract_score(cell_text):
    """Return the five-character score from a score cell's text.

    A plain score such as ``"1 : 2"`` is returned unchanged. Longer text such
    as ``"1 : 2(0 : 1)"`` yields the five characters just before the final
    character (``"0 : 1"`` in that example).
    """
    return cell_text if len(cell_text) == 5 else cell_text[-6:-1]


def get_last_5(url):
    """Return the scores of up to five rows listed below the given match.

    The page at ``url`` is opened in Chrome (a browser window is shown while
    the page loads) because the match table is generated by JavaScript. The
    rendered HTML is then parsed with lxml, the row matching the page title
    (both team names and the date) is located in the head-to-head table, and
    the scores of the rows beneath it are collected.

    Args:
        url: Address of the match page, including the ``#h2h;overall`` anchor.

    Returns:
        A list of at most five score strings such as ``"1 : 2"``; shorter when
        fewer than five rows follow the match.

    Raises:
        IndexError: If one of the XPath expressions matches nothing, e.g.
            because the page layout differs from the expected one.
        ValueError: If no row of the table corresponds to the match in the
            page title.
    """
    print("processing {u}, please wait...".format(u=url))

    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # The table is produced by the page's JavaScript, so read the HTML
        # from the live DOM rather than from the raw HTTP response.
        inner_html = browser.execute_script("return document.body.innerHTML")
    finally:
        # Only the HTML string is needed from here on; always close the
        # window, even when loading the page fails.
        browser.quit()

    tree: HtmlElement = html.fromstring(inner_html)

    # xpath() returns a list of every matching element, hence the [0].
    # The expressions were copied from Chrome's inspector ("Copy XPath").
    first_team = tree.xpath('//*[@id="flashscore"]/div[1]/div[1]/div[2]/div/div/a')[0].text
    second_team = tree.xpath('//*[@id="flashscore"]/div[1]/div[3]/div[2]/div/div/a')[0].text
    match_date = _title_date_to_row_date(tree.xpath('//*[@id="utime"]')[0].text)

    # Children of <tbody> are the table rows; the last child is dropped
    # because it is the "show more matches" button, not a match row.
    table_body = tree.xpath('//*[@id="tab-h2h-overall"]/div[1]/table/tbody')[0]
    rows = list(table_body)[:-1]

    # 1-based position of the title's match, i.e. the index of the first row
    # that comes after it.
    match_position = None
    for position, row in enumerate(rows, start=1):
        if is_match(first_team, second_team, match_date, row):
            match_position = position
            break
    if match_position is None:
        raise ValueError(
            "could not find {t1} vs {t2} on {d} in the matches table of {u}".format(
                t1=first_team, t2=second_team, d=match_date, u=url
            )
        )

    # Slices clamp at the end of the list, so this yields the rows that exist
    # beneath the match (between zero and five of them).
    following_rows = rows[match_position:match_position + 5]

    # Each <tr> holds six <td> cells; the fifth one wraps the score in a
    # <span>, e.g. <td><span><strong>1:2</strong></span></td>.
    scores = [_extract_score(row[4][0].text_content()) for row in following_rows]

    print("finished processing {u}.".format(u=url))
    return scores
def is_match(t1, t2, match_date, row):
    """Tell whether a table row describes the match identified by the title.

    Each row has six cells; the date is in cell 0, the first team in cell 2
    and the second team in cell 3. Every one of those cells wraps its content
    in a ``<span>``::

        <td><span>10.03.20</span></td>
        <td><span>TeamName</span></td>                   (team lost)
        <td><span><strong>TeamName</strong></span></td>  (team won)

    Args:
        t1: Name of the first team taken from the page title.
        t2: Name of the second team taken from the page title.
        match_date: Match date in the same textual form the rows use
            (``dd.mm.yy``).
        row: The ``<tr>`` element to inspect; its cells and their children
            are accessed by index.

    Returns:
        ``True`` when the row's date and both team names equal the given
        values, ``False`` otherwise.

    Raises:
        IndexError: If the row has fewer cells than expected or a cell has
            no child element.
    """
    def team_name(cell):
        # The name sits directly in the <span>, or inside a nested <strong>
        # when there is one.
        span = cell[0]
        # Use len() explicitly: truth-testing an lxml element is deprecated.
        return span[0].text if len(span) > 0 else span.text

    row_date = row[0][0].text
    row_team1 = team_name(row[2])
    row_team2 = team_name(row[3])

    # The date alone would usually be enough; the names are compared as well
    # to be sure the right row is picked.
    return match_date == row_date and t1 == row_team1 and t2 == row_team2