Solution 1 :

You could search for each part individually:

def bs2customList(soup):
    """Extract ``[issue_number, number, extension, date]`` from the bold span.

    Finds the first ``<span class="font-weight-bold">`` in *soup* and
    classifies every non-blank text node below it by the ``<span>`` that
    directly contains it:

    * no ``class`` and ``translate=""``              -> issue-number label
    * ``class="ng-star-inserted"``, no nested span   -> date
    * ``class="ng-star-inserted"``, one nested span  -> "<number> <extension>"

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        A list of four items; each is a string, or ``None`` when that piece
        of information is absent from the markup.
    """
    fwb = soup.find("span", class_="font-weight-bold")

    # One tuple per text node: (text, parent classes, parent translate attr,
    # number of spans nested in the parent).  Text is taken with
    # str(d).strip() so nothing here relies on text nodes having get_text().
    fdesc = [] if fwb is None else [
        (
            str(d).strip(),
            d.parent.get('class', []),
            d.parent.get('translate'),
            len(d.parent.select('span')),
        )
        for d in fwb.descendants
        if 'NavigableString' in str(type(d))
        and d.parent.name == 'span'
        and str(d).strip()
    ]

    filters = [([], '', 0), (['ng-star-inserted'], None, 0), (['ng-star-inserted'], None, 1)]
    issueNum, ydate, num_ext = [[d[0] for d in fdesc if d[1:] == f] for f in filters]

    # The number is the first whitespace-separated token and the extension is
    # everything after it.  Both are kept whole: indexing the number string
    # would cut "12" down to "1", and taking only the first extension word
    # would silently drop the rest.
    tokens = num_ext[0].split() if num_ext else []
    num = tokens[0] if tokens else None
    ext = ' '.join(tokens[1:]) or None

    return [
        issueNum[0] if issueNum else None,
        num,
        ext,
        ydate[0] if ydate else None,
    ]

or maybe this is more understandable:

def bs2customList(soup):
    """Return ``[issue_number, number, extension, date]`` for the bold span.

    Works on the ``span.ng-star-inserted`` elements found inside the first
    ``<span class="font-weight-bold">``.  The first of them that contains a
    ``span[translate]`` supplies the issue-number label and the text that
    follows it; the first one without such a child supplies the date.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        A four-item list whose entries are strings, or ``None`` where the
        markup does not provide the value.
    """
    bold = soup.find("span", class_="font-weight-bold")
    inserted = [] if bold is None else bold.select('span.ng-star-inserted')
    if not inserted:
        return [None, None, None, None]

    issueNum = num = ext = None

    labelled = next((s for s in inserted if s.select('span[translate]')), None)
    if labelled is not None:
        issueNum = labelled.select_one('span[translate]').get_text(strip=True)

        # First non-blank text node sitting directly inside the labelled span.
        trailing = None
        for child in labelled.children:
            if 'NavigableString' not in str(type(child)):
                continue
            stripped = str(child).strip()
            if stripped:
                trailing = stripped
                break

        if trailing is not None:
            words = trailing.split()
            # keep words[0].isdigit() only if "number" is always integer
            if len(words) > 1 and words[0].isdigit():
                num = words[0]
                ext = ' '.join(words[1:])
            else:
                num = ' '.join(words)

    unlabelled = next((s for s in inserted if not s.select('span[translate]')), None)
    ydate = None if unlabelled is None else unlabelled.get_text(strip=True)

    return [issueNum, num, ext, ydate]


Whichever version of the function is used, with the below test set:

# Sample documents fed to bs2customList(), ordered from most to least complete.
htmls = [
    # Case 1: label, number, extension and date are all present.
    '''
    <html>
      <body>
        <span _ngcontent-dna-c199="" class="font-weight-bold">
          <span _ngcontent-dna-c199="" class="ng-star-inserted">
            <span _ngcontent-dna-c199="" translate="">
              issue_number
            </span>
            4 Näköispainos
          </span>
          <span _ngcontent-dna-c199="" class="ng-star-inserted">
            6.12.1939
          </span>
        </span>
      </body>
    </html>
    ''',
    # Case 2: label, number and date; no extension after the number.
    '''
    <html>
      <body>
        <span _ngcontent-sut-c199="" class="font-weight-bold">
          <span _ngcontent-sut-c199="" class="ng-star-inserted">
            <span _ngcontent-sut-c199="" translate="">
              issue_number
            </span>
            8
          </span>
          <span _ngcontent-sut-c199="" class="ng-star-inserted">
            1998
          </span>
        </span>
      </body>
    </html>
    ''',
    # Case 3: only a date.
    '''
    <html>
      <body>
        <span _ngcontent-dgu-c199="" class="font-weight-bold">
          <span _ngcontent-dgu-c199="" class="ng-star-inserted">
            1905
          </span>
        </span>
      </body>
    </html>
    ''',
    # Case 4: the bold span exists but has no content.
    '<html><body><span class="font-weight-bold"></span></body></html>',
    # Case 5: no markup at all.
    '' # empty str
]

printing with

# Parse each sample document with lxml and print the extracted four-item list.
for h in htmls:
    parsed = BeautifulSoup(h, 'lxml')
    print(bs2customList(parsed))

gives the same output [with both versions]:

['issue_number', '4', 'Näköispainos', '6.12.1939']
['issue_number', '8', None, '1998']
[None, None, None, '1905']
[None, None, None, None]
[None, None, None, None]

(The last 2 tests are with an empty [textless] html and an empty string.)

Solution 2 :

Try it like below:

from bs4 import BeautifulSoup

# Sample documents for the loop below; one result list is built per entry.
pages = [
# Page 1: label, number, extension and date are all present.
'''
<html>
 <body>
  <span _ngcontent-dna-c199="" class="font-weight-bold">
   <span _ngcontent-dna-c199="" class="ng-star-inserted">
    <span _ngcontent-dna-c199="" translate="">
     issue_number
    </span>
    4 Näköispainos
   </span>
   <span _ngcontent-dna-c199="" class="ng-star-inserted">
    6.12.1939
   </span>
  </span>
 </body>
</html>
''',
# Page 2: label, number and date; no extension after the number.
'''
<html>
 <body>
  <span _ngcontent-sut-c199="" class="font-weight-bold">
   <span _ngcontent-sut-c199="" class="ng-star-inserted">
    <span _ngcontent-sut-c199="" translate="">
     issue_number
    </span>
    8
   </span>
   <span _ngcontent-sut-c199="" class="ng-star-inserted">
    1998
   </span>
  </span>
 </body>
</html>
''',
# Page 3: only a date.
'''
<html>
 <body>
  <span _ngcontent-dgu-c199="" class="font-weight-bold">
   <span _ngcontent-dgu-c199="" class="ng-star-inserted">
    1905
   </span>
  </span>
 </body>
</html>
''' ]
# Build {'html_<n>': [issue_number, number, extension, date]} for every page.
# Missing pieces are recorded as the string 'None'.
my_dct = {}
my_list = []
for i, page in enumerate(pages):
    soup = BeautifulSoup(page, "lxml") # html_1, html_2, html_3
    spans = soup.find_all("span", attrs= {"class": "ng-star-inserted"})
    my_list = []

    if not spans:
        # Nothing to read on this page: report every field as missing
        # instead of failing on spans[0].
        my_dct['html_' + str(i + 1)] = ['None'] * 4
        continue

    # Optional label nested inside the first span.  Line breaks are removed
    # with '\n'; removing the letter 'n' would corrupt 'issue_number'.
    sp_0 = spans[0].span
    # Reset on every page so a label from an earlier page is never reused.
    txt = sp_0.text.replace('\n', '').strip() if sp_0 else ''
    my_list.append(txt if sp_0 else 'None')

    if len(spans) > 1:
        # Text of the first span without its label: "<number> <extension>".
        txt_1 = spans[0].text.replace('\n', '').replace(txt, '').strip()
        words = txt_1.split()
        my_list.append(words[0] if words else 'None')
        # Keep the whole extension, not only its first word.
        my_list.append(' '.join(words[1:]) if len(words) > 1 else 'None')
        # The second span carries the date.
        my_list.append(spans[1].text.replace('\n', '').strip())
    else:
        # A single span holds only the date; number and extension are absent.
        my_list.append('None')
        my_list.append('None')
        my_list.append(spans[0].text.replace('\n', '').strip())

    my_dct['html_' + str(i + 1)] = my_list
print(my_dct)

'''  R e s u l t :
{
  'html_1': ['issue_number', '4', 'Näköispainos', '6.12.1939'], 
  'html_2': ['issue_number', '8', 'None', '1998'], 
  'html_3': ['None', 'None', 'None', '1905']
}
'''

Regards…

Problem :

I have 3 different types of HTML snippets, each of which is part of a larger page, as follows:

<html>
 <body>
  <span _ngcontent-dna-c199="" class="font-weight-bold">
   <span _ngcontent-dna-c199="" class="ng-star-inserted">
    <span _ngcontent-dna-c199="" translate="">
     issue_number
    </span>
    4 Näköispainos
   </span>
   <span _ngcontent-dna-c199="" class="ng-star-inserted">
    6.12.1939
   </span>
  </span>
 </body>
</html>

and

<html>
 <body>
  <span _ngcontent-sut-c199="" class="font-weight-bold">
   <span _ngcontent-sut-c199="" class="ng-star-inserted">
    <span _ngcontent-sut-c199="" translate="">
     issue_number
    </span>
    8
   </span>
   <span _ngcontent-sut-c199="" class="ng-star-inserted">
    1998
   </span>
  </span>
 </body>
</html>

and

<html>
 <body>
  <span _ngcontent-dgu-c199="" class="font-weight-bold">
   <span _ngcontent-dgu-c199="" class="ng-star-inserted">
    1905
   </span>
  </span>
 </body>
</html>

Given the following code:

from bs4 import BeautifulSoup
# Parse one snippet, locate the bold wrapper span, and print its text split
# on whitespace.
soup = BeautifulSoup(html, "lxml") # html_1, html_2, html_3
res = soup.find("span", class_="font-weight-bold")
words = res.text.split()
print(words)

I get the following results:

['issue_number', '4', 'Näköispainos', '6.12.1939'] # html_1
['issue_number', '8', '1998']                      # html_2
['1905']                                           # html_3

However, my desired custom-made list should have 4 elements and looks like this:

desired_list = ["issue_number", "number", "extension", "date"]

so if there is no info available in html snippet, I’d like to get None or simply "-" in that specific element of my desired custom list as follows:

['issue_number', '4', 'Näköispainos', '6.12.1939'] # html_1
['issue_number', '8', None, '1998']                # html_2
[None, None, None, '1905']                         # html_3

Is there any way to manipulate the result list to obtain the desired list using soup.find()?

Comments

Comment posted by furas

You should search for every element separately; then you have better control, because when it can’t find an element you can put

Comment posted by Farid Alijani

I get this error running the code, both def functions:

Comment posted by Driftr95

@FaridAlijani sorry, I didn’t anticipate that error because I’d never had issues with using

Comment posted by stackoverflow.com/questions/74317095/…

Dude, Thanks I appreciate your time for answering this! I actually had different

By