From 5cedcecd6ae31e2485dcab2ddd74c74a4779545d Mon Sep 17 00:00:00 2001 From: Joris Date: Tue, 3 Sep 2019 21:01:53 +0200 Subject: Make LBC to work Use request headers to simulate a normal browser --- src/lib/haskell/FetchAd.hs | 20 ++++++------ src/lib/haskell/Parser/LeboncoinParser.hs | 17 ++++++---- src/lib/haskell/Utils/HTTP.hs | 52 +++++++++++++++++++++++-------- 3 files changed, 60 insertions(+), 29 deletions(-) (limited to 'src/lib/haskell') diff --git a/src/lib/haskell/FetchAd.hs b/src/lib/haskell/FetchAd.hs index a206181..1708fe4 100644 --- a/src/lib/haskell/FetchAd.hs +++ b/src/lib/haskell/FetchAd.hs @@ -5,7 +5,7 @@ module FetchAd ) where import Data.Either (rights) -import Data.Text.Encoding as T +import Network.HTTP.Conduit (Manager) import Model.Ad (Ad) import Model.URL (URL) @@ -14,23 +14,23 @@ import qualified Parser.OuestFranceParser as OuestFranceParser import qualified Parser.SeLogerParser as SeLogerParser import qualified Utils.HTTP as HTTP -leboncoin :: [URL] -> IO [Ad] -leboncoin urls = +leboncoin :: Manager -> [URL] -> IO [Ad] +leboncoin manager urls = fmap (concat . map LeboncoinParser.parse . rights) . sequence - . map (HTTP.get T.decodeLatin1) + . map (HTTP.get manager) $ urls -ouestFrance :: [URL] -> IO [Ad] -ouestFrance urls = +ouestFrance :: Manager -> [URL] -> IO [Ad] +ouestFrance manager urls = fmap (concat . map OuestFranceParser.parse . rights) . sequence - . map (HTTP.get T.decodeUtf8) + . map (HTTP.get manager) $ urls -seLoger :: [URL] -> IO [Ad] -seLoger urls = +seLoger :: Manager -> [URL] -> IO [Ad] +seLoger manager urls = fmap (concat . map SeLogerParser.parse . rights) . sequence - . map (HTTP.get T.decodeUtf8) + . map (HTTP.get manager) $ urls diff --git a/src/lib/haskell/Parser/LeboncoinParser.hs b/src/lib/haskell/Parser/LeboncoinParser.hs index 77213cb..99d8116 100644 --- a/src/lib/haskell/Parser/LeboncoinParser.hs +++ b/src/lib/haskell/Parser/LeboncoinParser.hs @@ -11,14 +11,19 @@ import Model.Ad (Ad (Ad)) import Parser.Utils parse :: Text -> [Ad] -parse page = - catMaybes . fmap parseAd $ partitions (~== (T.unpack "")) tags - where tags = getTagsBetween "
  • " "
    " (parseTags page) +parse = + catMaybes + . fmap parseAd + . partitions (~== (T.unpack "
  • ")) + . parseTags parseAd :: [Tag Text] -> Maybe Ad parseAd tags = do - name <- getTagTextAfter "

    " tags - location <- getTagAttribute "" "content" tags - let price = getTagTextAfter "

    " tags + name <- getTagTextAfter "" tags + location <- getTagTextAfter "

    " tags + let price = + case getTagsBetween "" "" tags of + [] -> Nothing + xs -> Just $ innerText xs url <- getTagAttribute "" "href" tags return (Ad name location price (T.concat ["https:", url])) diff --git a/src/lib/haskell/Utils/HTTP.hs b/src/lib/haskell/Utils/HTTP.hs index 87635ce..9bcf5f0 100644 --- a/src/lib/haskell/Utils/HTTP.hs +++ b/src/lib/haskell/Utils/HTTP.hs @@ -2,21 +2,47 @@ module Utils.HTTP ( get ) where -import Control.Exception (SomeException, try) -import Data.ByteString (ByteString) -import qualified Data.ByteString.Lazy as BS -import Data.Text (Text) -import qualified Data.Text as T -import Network.HTTP.Conduit +import qualified Data.ByteString.Lazy as BS +import Data.Text (Text) +import qualified Data.Text as T +import Data.Text.Encoding as T +import Data.Text.IO as T +import Network.HTTP.Conduit (Manager) +import qualified Network.HTTP.Conduit as H +import qualified Network.HTTP.Simple as HS +import qualified Network.HTTP.Types.Status as Status import Model.URL -get :: (ByteString -> Text) -> URL -> IO (Either Text Text) -get decode url = mapLeft (T.pack . show) <$> (try (unsafeGetPage decode url) :: IO (Either SomeException Text)) +get :: Manager -> URL -> IO (Either Text Text) +get manager url = do + request <- H.parseRequest (T.unpack url) -unsafeGetPage :: (ByteString -> Text) -> URL -> IO Text -unsafeGetPage decode url = (decode . BS.toStrict) <$> simpleHttp (T.unpack url) + response <- H.httpLbs (HS.setRequestHeaders requestHeaders request) manager + let body = T.decodeUtf8 . BS.toStrict . H.responseBody $ response + let statusCode = Status.statusCode . H.responseStatus $ response -mapLeft :: (a -> c) -> Either a b -> Either c b -mapLeft f (Left l) = Left (f l) -mapLeft _ (Right r) = (Right r) + if statusCode >= 200 && statusCode < 300 then + return . Right $ body + else do + T.putStrLn . T.concat $ + [ "Got status " + , T.pack . show $ statusCode + , " while fetching " + , url + , ":\n" + , body + ] + return . Left $ body + + where + requestHeaders = + [ ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0") + , ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + , ("Accept-Language", "en-US,en;fr;q=0.5") + , ("Accept-Encoding", "gzip, deflate, br") + , ("Referer", "https://duckduckgo.com/") + , ("DNT", "1") + , ("Connection", "keep-alive") + , ("Upgrade-Insecure-Requests", "1") + ] -- cgit v1.2.3