1 files changed, 58 insertions, 0 deletions
diff --git a/data-extractor/extract.py b/data-extractor/extract.py
new file mode 100644
index 0000000..0e9702e
--- /dev/null
+++ b/data-extractor/extract.py
@@ -0,0 +1,58 @@
+# Improvement: fix accents
+
+import json
+import urllib.request
+import time
+from bs4 import BeautifulSoup
+
+links = []
+with urllib.request.urlopen('https://www.lanutrition.fr/les-aliments-a-la-loupe') as fp:
+    content = fp.read()
+    soup = BeautifulSoup(content, 'html.parser')
+    for link in soup.find(id='nutriments_by_alpha').find_all('a'):
+        href = link.get('href')
+        links.append(f'https://www.lanutrition.fr{href}')
+
+def extractCarbohydrates(soup):
+    try:
+        return float(soup
+            .find('div', {"name": 'glucides'})
+            .find(class_='value-weight')
+            .get_text()
+            .split(' ')[0])
+    except:
+        return None
+
+def extractGlycemicIndex(soup):
+    try:
+        n = int(soup
+            .find(class_='aliment-poids')
+            .find(class_='nombre')
+            .get_text())
+        if n > 0:
+            return n
+    except:
+        return None
+
+aliments = []
+for link in links:
+    print(link)
+    time.sleep(0.5)
+    with urllib.request.urlopen(link) as fp:
+        content = fp.read()
+        soup = BeautifulSoup(content, 'html.parser')
+        name = soup.find('h1', {"id": 'page-title'}).get_text()
+        carbohydrates = extractCarbohydrates(soup)
+        glycemicIndex = extractGlycemicIndex(soup)
+        if name is not None and carbohydrates is not None and glycemicIndex is not None:
+            print('+')
+            aliments.append({
+                "name": name,
+                "glycemicIndex": glycemicIndex,
+                "carbohydrates": carbohydrates
+            })
+        else:
+            print('-')
+
+with open('export.json', 'w') as f:
+    f.write(json.dumps(aliments))