aboutsummaryrefslogtreecommitdiff
path: root/src/lib/haskell
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/haskell')
-rw-r--r--src/lib/haskell/FetchAd.hs20
-rw-r--r--src/lib/haskell/Parser/LeboncoinParser.hs17
-rw-r--r--src/lib/haskell/Utils/HTTP.hs52
3 files changed, 60 insertions, 29 deletions
diff --git a/src/lib/haskell/FetchAd.hs b/src/lib/haskell/FetchAd.hs
index a206181..1708fe4 100644
--- a/src/lib/haskell/FetchAd.hs
+++ b/src/lib/haskell/FetchAd.hs
@@ -5,7 +5,7 @@ module FetchAd
) where
import Data.Either (rights)
-import Data.Text.Encoding as T
+import Network.HTTP.Conduit (Manager)
import Model.Ad (Ad)
import Model.URL (URL)
@@ -14,23 +14,23 @@ import qualified Parser.OuestFranceParser as OuestFranceParser
import qualified Parser.SeLogerParser as SeLogerParser
import qualified Utils.HTTP as HTTP
-leboncoin :: [URL] -> IO [Ad]
-leboncoin urls =
+leboncoin :: Manager -> [URL] -> IO [Ad]
+leboncoin manager urls =
fmap (concat . map LeboncoinParser.parse . rights)
. sequence
- . map (HTTP.get T.decodeLatin1)
+ . map (HTTP.get manager)
$ urls
-ouestFrance :: [URL] -> IO [Ad]
-ouestFrance urls =
+ouestFrance :: Manager -> [URL] -> IO [Ad]
+ouestFrance manager urls =
fmap (concat . map OuestFranceParser.parse . rights)
. sequence
- . map (HTTP.get T.decodeUtf8)
+ . map (HTTP.get manager)
$ urls
-seLoger :: [URL] -> IO [Ad]
-seLoger urls =
+seLoger :: Manager -> [URL] -> IO [Ad]
+seLoger manager urls =
fmap (concat . map SeLogerParser.parse . rights)
. sequence
- . map (HTTP.get T.decodeUtf8)
+ . map (HTTP.get manager)
$ urls
diff --git a/src/lib/haskell/Parser/LeboncoinParser.hs b/src/lib/haskell/Parser/LeboncoinParser.hs
index 77213cb..99d8116 100644
--- a/src/lib/haskell/Parser/LeboncoinParser.hs
+++ b/src/lib/haskell/Parser/LeboncoinParser.hs
@@ -11,14 +11,19 @@ import Model.Ad (Ad (Ad))
import Parser.Utils
parse :: Text -> [Ad]
-parse page =
- catMaybes . fmap parseAd $ partitions (~== (T.unpack "<a>")) tags
- where tags = getTagsBetween "<li itemtype=http://schema.org/Offer>" "<div class=information-immo_content>" (parseTags page)
+parse =
+ catMaybes
+ . fmap parseAd
+ . partitions (~== (T.unpack "<li>"))
+ . parseTags
parseAd :: [Tag Text] -> Maybe Ad
parseAd tags = do
- name <- getTagTextAfter "<h2 class=item_title>" tags
- location <- getTagAttribute "<meta itemprop=address>" "content" tags
- let price = getTagTextAfter "<h3 class=item_price>" tags
+ name <- getTagTextAfter "<span data-qa-id=aditem_title>" tags
+ location <- getTagTextAfter "<p data-qa-id=aditem_location>" tags
+ let price =
+ case getTagsBetween "<span itemprop=priceCurrency>" "</span>" tags of
+ [] -> Nothing
+ xs -> Just $ innerText xs
url <- getTagAttribute "<a>" "href" tags
return (Ad name location price (T.concat ["https:", url]))
diff --git a/src/lib/haskell/Utils/HTTP.hs b/src/lib/haskell/Utils/HTTP.hs
index 87635ce..9bcf5f0 100644
--- a/src/lib/haskell/Utils/HTTP.hs
+++ b/src/lib/haskell/Utils/HTTP.hs
@@ -2,21 +2,47 @@ module Utils.HTTP
( get
) where
-import Control.Exception (SomeException, try)
-import Data.ByteString (ByteString)
-import qualified Data.ByteString.Lazy as BS
-import Data.Text (Text)
-import qualified Data.Text as T
-import Network.HTTP.Conduit
+import qualified Data.ByteString.Lazy as BS
+import Data.Text (Text)
+import qualified Data.Text as T
+import Data.Text.Encoding as T
+import Data.Text.IO as T
+import Network.HTTP.Conduit (Manager)
+import qualified Network.HTTP.Conduit as H
+import qualified Network.HTTP.Simple as HS
+import qualified Network.HTTP.Types.Status as Status
import Model.URL
-get :: (ByteString -> Text) -> URL -> IO (Either Text Text)
-get decode url = mapLeft (T.pack . show) <$> (try (unsafeGetPage decode url) :: IO (Either SomeException Text))
+get :: Manager -> URL -> IO (Either Text Text)
+get manager url = do
+ request <- H.parseRequest (T.unpack url)
-unsafeGetPage :: (ByteString -> Text) -> URL -> IO Text
-unsafeGetPage decode url = (decode . BS.toStrict) <$> simpleHttp (T.unpack url)
+ response <- H.httpLbs (HS.setRequestHeaders requestHeaders request) manager
+ let body = T.decodeUtf8 . BS.toStrict . H.responseBody $ response
+ let statusCode = Status.statusCode . H.responseStatus $ response
-mapLeft :: (a -> c) -> Either a b -> Either c b
-mapLeft f (Left l) = Left (f l)
-mapLeft _ (Right r) = (Right r)
+ if statusCode >= 200 && statusCode < 300 then
+ return . Right $ body
+ else do
+ T.putStrLn . T.concat $
+ [ "Got status "
+ , T.pack . show $ statusCode
+ , " while fetching "
+ , url
+ , ":\n"
+ , body
+ ]
+ return . Left $ body
+
+ where
+ requestHeaders =
+ [ ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0")
+ , ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+ , ("Accept-Language", "en-US,en;fr;q=0.5")
+ , ("Accept-Encoding", "gzip, deflate, br")
+ , ("Referer", "https://duckduckgo.com/")
+ , ("DNT", "1")
+ , ("Connection", "keep-alive")
+ , ("Upgrade-Insecure-Requests", "1")
+ ]