diff options
Diffstat (limited to 'data-extractor')
-rw-r--r-- | data-extractor/extract.py | 58 |
1 files changed, 58 insertions, 0 deletions
"""Scrape food pages from lanutrition.fr and export them as JSON.

The index page is read for links to individual food pages. For each food
page the name, glycemic index and carbohydrate value are extracted; foods
for which all three values are available are written to ``export.json``.
"""
# Improvement: fix accents
# NOTE(review): json.dumps escapes non-ASCII characters as \uXXXX by default;
# if that is the "accents" issue meant above, ensure_ascii=False would address
# it -- confirm before changing the export format.

import json
import urllib.request
import time
from bs4 import BeautifulSoup

# Collect the absolute URL of every food page linked from the index.
links = []
with urllib.request.urlopen('https://www.lanutrition.fr/les-aliments-a-la-loupe') as fp:
    content = fp.read()
soup = BeautifulSoup(content, 'html.parser')
for link in soup.find(id='nutriments_by_alpha').find_all('a'):
    href = link.get('href')
    # Anchors without an href would otherwise yield the bogus URL
    # 'https://www.lanutrition.frNone'.
    if href:
        links.append(f'https://www.lanutrition.fr{href}')


def extractCarbohydrates(soup):
    """Return the carbohydrate value found on a food page, or None.

    Reads the text of the ``value-weight`` element inside the ``div`` whose
    ``name`` attribute is ``glucides`` and parses the part before the first
    space as a float.

    Args:
        soup: Parsed food page (BeautifulSoup object).

    Returns:
        The parsed float, or None when an element is missing or the text is
        not a number.
    """
    try:
        return float(soup
                     .find('div', {"name": 'glucides'})
                     .find(class_='value-weight')
                     .get_text()
                     .split(' ')[0])
    except (AttributeError, ValueError):
        # AttributeError: find() returned None for a missing element.
        # ValueError: the extracted text is not a valid number.
        return None


def extractGlycemicIndex(soup):
    """Return the glycemic index found on a food page, or None.

    Reads the text of the ``nombre`` element inside the ``aliment-poids``
    element and parses it as an int.

    Args:
        soup: Parsed food page (BeautifulSoup object).

    Returns:
        The parsed int when it is strictly positive; None when an element is
        missing, the text is not an integer, or the value is not positive.
    """
    try:
        n = int(soup
                .find(class_='aliment-poids')
                .find(class_='nombre')
                .get_text())
    except (AttributeError, ValueError):
        # AttributeError: find() returned None for a missing element.
        # ValueError: the extracted text is not a valid integer.
        return None
    return n if n > 0 else None


# Visit every food page and keep the foods with complete data.
aliments = []
for link in links:
    print(link)
    time.sleep(0.5)  # pause between requests
    with urllib.request.urlopen(link) as fp:
        content = fp.read()
    soup = BeautifulSoup(content, 'html.parser')
    # A page without the title element is skipped like any other incomplete
    # page instead of aborting the whole run with an AttributeError.
    title = soup.find('h1', {"id": 'page-title'})
    name = title.get_text() if title is not None else None
    carbohydrates = extractCarbohydrates(soup)
    glycemicIndex = extractGlycemicIndex(soup)
    if name is not None and carbohydrates is not None and glycemicIndex is not None:
        print('+')
        aliments.append({
            "name": name,
            "glycemicIndex": glycemicIndex,
            "carbohydrates": carbohydrates,
        })
    else:
        print('-')

with open('export.json', 'w') as f:
    f.write(json.dumps(aliments))