about summary refs log tree commit diff
path: root/data-extractor/extract.py
diff options
context:
space:
mode:
Diffstat (limited to 'data-extractor/extract.py')
-rw-r--r-- data-extractor/extract.py 58
1 files changed, 58 insertions, 0 deletions
diff --git a/data-extractor/extract.py b/data-extractor/extract.py
new file mode 100644
index 0000000..0e9702e
--- /dev/null
+++ b/data-extractor/extract.py
@@ -0,0 +1,58 @@
+# Improvement: fix accents
+
+import json
+import urllib.request
+import time
+from bs4 import BeautifulSoup
+
+# Collect the absolute URL of every page linked from the index page's
+# 'nutriments_by_alpha' element.
+links = []
+with urllib.request.urlopen('https://www.lanutrition.fr/les-aliments-a-la-loupe') as fp:
+    content = fp.read()
+soup = BeautifulSoup(content, 'html.parser')
+for link in soup.find(id='nutriments_by_alpha').find_all('a'):
+    href = link.get('href')
+    # An anchor without an href would otherwise produce the bogus URL
+    # 'https://www.lanutrition.frNone' and break the crawl later on.
+    if href:
+        links.append(f'https://www.lanutrition.fr{href}')
+
+def extractCarbohydrates(soup):
+    """Return the carbohydrate value found on a parsed food page.
+
+    Looks up the ``div`` whose ``name`` attribute is ``glucides``, reads the
+    text of its ``value-weight`` element and converts the part before the
+    first space to a float.
+
+    Args:
+        soup: Parsed page (BeautifulSoup document or tag).
+
+    Returns:
+        The value as a float, or None when one of the elements is missing or
+        the text is not a valid number.
+    """
+    try:
+        block = soup.find('div', {"name": 'glucides'})
+        text = block.find(class_='value-weight').get_text()
+        return float(text.split(' ')[0])
+    except (AttributeError, ValueError):
+        # AttributeError: find() returned None for a missing element.
+        # ValueError: the extracted text cannot be parsed as a float.
+        return None
+
+def extractGlycemicIndex(soup):
+    """Return the glycemic index found on a parsed food page.
+
+    Reads the text of the ``nombre`` element nested inside the
+    ``aliment-poids`` element and converts it to an int.
+
+    Args:
+        soup: Parsed page (BeautifulSoup document or tag).
+
+    Returns:
+        The index as a strictly positive int, or None when an element is
+        missing, the text is not an integer, or the value is not positive.
+    """
+    try:
+        n = int(soup
+            .find(class_='aliment-poids')
+            .find(class_='nombre')
+            .get_text())
+    except (AttributeError, ValueError):
+        # AttributeError: find() returned None for a missing element.
+        # ValueError: the extracted text cannot be parsed as an int.
+        return None
+    # Non-positive values are reported as None, same as a missing value.
+    return n if n > 0 else None
+
+# Visit every collected page and keep the ones that provide a name, a
+# carbohydrate value and a glycemic index.
+aliments = []
+for link in links:
+    print(link)
+    time.sleep(0.5)  # pause between two requests
+    try:
+        with urllib.request.urlopen(link) as fp:
+            content = fp.read()
+    except OSError as err:
+        # URLError and HTTPError derive from OSError: skip the failing page
+        # instead of aborting the crawl and losing everything collected so far.
+        print(f'- {err}')
+        continue
+    soup = BeautifulSoup(content, 'html.parser')
+    title = soup.find('h1', {"id": 'page-title'})
+    # find() returns None when the heading is absent; get_text() itself never
+    # returns None, so the missing case has to be detected here.
+    name = title.get_text() if title is not None else None
+    carbohydrates = extractCarbohydrates(soup)
+    glycemicIndex = extractGlycemicIndex(soup)
+    if name is not None and carbohydrates is not None and glycemicIndex is not None:
+        print('+')
+        aliments.append({
+            "name": name,
+            "glycemicIndex": glycemicIndex,
+            "carbohydrates": carbohydrates
+        })
+    else:
+        print('-')
+
+# Write the result as UTF-8 and keep accented characters literal: by default
+# json escapes every non-ASCII character as \uXXXX and open() uses the
+# platform's default encoding.
+with open('export.json', 'w', encoding='utf-8') as f:
+    json.dump(aliments, f, ensure_ascii=False)