diff options
author | Joris | 2016-07-14 11:57:12 +0000 |
---|---|---|
committer | Joris | 2016-07-14 12:00:05 +0000 |
commit | 69e69017b75d1cdaa1fd2aef2818de5111b29735 (patch) | |
tree | 99dba8f67dc1c55b2cc22f33f81c59c7355b337b /src/Parser | |
parent | 04f9a66c66ca137d9fee6ccca228c41fec960fe0 (diff) |
Update code and fix parsers
Diffstat (limited to 'src/Parser')
-rw-r--r-- | src/Parser/Detail.hs | 12 | ||||
-rw-r--r-- | src/Parser/Resume.hs | 20 | ||||
-rw-r--r-- | src/Parser/Utils.hs | 14 |
3 files changed, 28 insertions, 18 deletions
diff --git a/src/Parser/Detail.hs b/src/Parser/Detail.hs index 3f424e9..2418d07 100644 --- a/src/Parser/Detail.hs +++ b/src/Parser/Detail.hs @@ -1,5 +1,5 @@ module Parser.Detail - ( parseDetail + ( parse ) where import Data.Text (Text) @@ -16,18 +16,18 @@ import Model.Detail import Parser.Utils -parseDetail :: Text -> Detail -parseDetail page = +parse :: Text -> Detail +parse page = let tags = parseTags page in Detail { description = parseDescription tags - , images = getTagAttributes "<meta itemprop=image>" (T.pack "content") tags + , images = map (\url -> T.concat [T.pack "https:", url]) $ getTagAttributes "<meta itemprop=image>" (T.pack "content") tags , properties = parseProperties tags } parseDescription :: [Tag Text] -> Maybe Text parseDescription tags = - let descriptionTags = getTagsBetween "<div itemprop=description>" "</div>" tags + let descriptionTags = getTagsBetween "<p itemprop=description>" "</p>" tags in if null descriptionTags then Nothing @@ -37,7 +37,7 @@ parseDescription tags = parseProperties :: [Tag Text] -> Map Text Text parseProperties tags = - let mbUtagData = getTagTextAfter "<script>" . getTagsAfter "<body>" $ tags + let mbUtagData = getTagTextAfter "<script>" . getTagsAfter "</script>" . getTagsAfter "<body>" $ tags in fromMaybe M.empty (fmap parseUtagData mbUtagData) parseUtagData :: Text -> Map Text Text diff --git a/src/Parser/Resume.hs b/src/Parser/Resume.hs index 76faca4..f300ec3 100644 --- a/src/Parser/Resume.hs +++ b/src/Parser/Resume.hs @@ -1,5 +1,5 @@ module Parser.Resume - ( parseResumes + ( parse ) where import Data.Maybe (catMaybes) @@ -8,22 +8,22 @@ import qualified Data.Text as T import Text.HTML.TagSoup -import Model.Resume +import Model.Resume (Resume(Resume)) import Parser.Utils -parseResumes :: Text -> [Resume] -parseResumes page = - case sections (~== "<div class=list-lbc>") (parseTags page) of +parse :: Text -> [Resume] +parse page = + case dropWhile (not . hasClass (T.pack "section") (T.pack "tabsContent")) (parseTags page) of [] -> [] - sectionTags : _ -> - let lbcTags = takeWhile (~/= "<div id=alertesCartouche>") sectionTags + sectionTags -> + let lbcTags = takeWhile (not . hasClass (T.pack "div") (T.pack "information-immo")) sectionTags in catMaybes . fmap parseResume $ partitions (~== "<a>") lbcTags parseResume :: [Tag Text] -> Maybe Resume parseResume item = do - name <- getTagTextAfter "<h2 class=title>" item - let price = getTagTextAfter "<div class=price>" item + name <- getTagTextAfter "<h2 class=item_title>" item + let price = getTagTextAfter "<h3 class=item_price>" item url <- getTagAttribute "<a>" (T.pack "href") item - return Resume { name = name, price = price, url = url } + return (Resume name price (T.concat [T.pack "https:", url])) diff --git a/src/Parser/Utils.hs b/src/Parser/Utils.hs index d72a1ce..98694bb 100644 --- a/src/Parser/Utils.hs +++ b/src/Parser/Utils.hs @@ -5,14 +5,16 @@ module Parser.Utils , getTagAttributes , getTagAttribute , getTagTextAfter + , hasClass ) where import Data.List (find, findIndex) -import Data.Maybe (listToMaybe, catMaybes) +import Data.Maybe (listToMaybe, catMaybes, isJust) import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup +import Text.HTML.TagSoup.Match (tagOpen) getTagsBefore :: String -> [Tag Text] -> [Tag Text] getTagsBefore selector = takeWhile (~/= selector) @@ -43,7 +45,15 @@ getTagTextAfter selector tags = maybeTagAttribute :: Text -> Tag Text -> Maybe Text maybeTagAttribute name (TagOpen _ xs) = fmap snd . find (\(x, _) -> x == name) $ xs -maybeTagAttribute attribute _ = Nothing +maybeTagAttribute _ _ = Nothing safeGetAt :: Int -> [a] -> Maybe a safeGetAt index = listToMaybe . drop index + +hasClass :: Text -> Text -> Tag Text -> Bool +hasClass selector className = + tagOpen ((==) selector) (isJust . find matchClass) + where matchClass (name, values) = + ( name == (T.pack "class") + && (isJust . find ((==) className) . T.words $ values) + ) |