diff options
author | Joris | 2018-06-17 23:24:47 +0200 |
---|---|---|
committer | Joris | 2018-06-18 11:13:55 +0200 |
commit | 0a4d3c8f12dc5797a919a00b6bcaf759947687cc (patch) | |
tree | bcb89781e22c2314bf0c064ebb37cb7f8a362f5c /src/Parser | |
parent | e2a5c7c5c596d057b6fa9c08a8204ce1429cfdc4 (diff) |
Add ouest france parser
Diffstat (limited to 'src/Parser')
-rw-r--r-- | src/Parser/Detail.hs | 63 | ||||
-rw-r--r-- | src/Parser/Resume.hs | 31 | ||||
-rw-r--r-- | src/Parser/Utils.hs | 59 |
3 files changed, 0 insertions, 153 deletions
diff --git a/src/Parser/Detail.hs b/src/Parser/Detail.hs deleted file mode 100644 index 2418d07..0000000 --- a/src/Parser/Detail.hs +++ /dev/null @@ -1,63 +0,0 @@ -module Parser.Detail - ( parse - ) where - -import Data.Text (Text) -import qualified Data.Text as T - -import Data.Map (Map) -import qualified Data.Map as M - -import Data.Maybe (catMaybes, fromMaybe) - -import Text.HTML.TagSoup - -import Model.Detail - -import Parser.Utils - -parse :: Text -> Detail -parse page = - let tags = parseTags page - in Detail - { description = parseDescription tags - , images = map (\url -> T.concat [T.pack "https:", url]) $ getTagAttributes "<meta itemprop=image>" (T.pack "content") tags - , properties = parseProperties tags - } - -parseDescription :: [Tag Text] -> Maybe Text -parseDescription tags = - let descriptionTags = getTagsBetween "<p itemprop=description>" "</p>" tags - in if null descriptionTags - then - Nothing - else - let replaceBr = map (\tag -> if tag ~== "<br>" then TagText (T.pack "\n") else tag) - in Just . T.strip . renderTags . replaceBr $ descriptionTags - -parseProperties :: [Tag Text] -> Map Text Text -parseProperties tags = - let mbUtagData = getTagTextAfter "<script>" . getTagsAfter "</script>" . getTagsAfter "<body>" $ tags - in fromMaybe M.empty (fmap parseUtagData mbUtagData) - -parseUtagData :: Text -> Map Text Text -parseUtagData = - M.fromList - . catMaybes - . fmap parseUtag - . T.splitOn (T.pack ",") - . T.takeWhile (/= '}') - . T.drop 1 - . T.dropWhile (/= '{') - -parseUtag :: Text -> Maybe (Text, Text) -parseUtag utag = - case T.splitOn (T.pack ":") utag of - [x, y] -> Just (T.strip x, removeQuotes y) - _ -> Nothing - -removeQuotes :: Text -> Text -removeQuotes = - T.takeWhile (/= '\"') - . T.dropWhile (== '\"') - . T.strip diff --git a/src/Parser/Resume.hs b/src/Parser/Resume.hs deleted file mode 100644 index 8940be7..0000000 --- a/src/Parser/Resume.hs +++ /dev/null @@ -1,31 +0,0 @@ -module Parser.Resume - ( parse - ) where - -import Data.Maybe (catMaybes, isJust) -import Data.List (find) -import Data.Text (Text) -import qualified Data.Text as T - -import Text.HTML.TagSoup - -import Model.Resume (Resume(Resume)) - -import Parser.Utils - -parse :: Text -> [Resume] -parse page = - case dropWhile (not . hasClass (T.pack "section") (T.pack "tabsContent")) (parseTags page) of - [] -> - [] - sectionTags -> - let lbcTags = takeWhile (not . hasClass (T.pack "div") (T.pack "information-immo")) sectionTags - in catMaybes . fmap parseResume $ partitions (~== "<a>") lbcTags - -parseResume :: [Tag Text] -> Maybe Resume -parseResume item = do - name <- getTagTextAfter "<h2 class=item_title>" item - let price = getTagTextAfter "<h3 class=item_price>" item - url <- getTagAttribute "<a>" (T.pack "href") item - let isPro = isJust . find (~== "<span class=ispro>") $ item - return (Resume name price (T.concat [T.pack "https:", url]) isPro) diff --git a/src/Parser/Utils.hs b/src/Parser/Utils.hs deleted file mode 100644 index 98694bb..0000000 --- a/src/Parser/Utils.hs +++ /dev/null @@ -1,59 +0,0 @@ -module Parser.Utils - ( getTagsBefore - , getTagsAfter - , getTagsBetween - , getTagAttributes - , getTagAttribute - , getTagTextAfter - , hasClass - ) where - -import Data.List (find, findIndex) -import Data.Maybe (listToMaybe, catMaybes, isJust) -import Data.Text (Text) -import qualified Data.Text as T - -import Text.HTML.TagSoup -import Text.HTML.TagSoup.Match (tagOpen) - -getTagsBefore :: String -> [Tag Text] -> [Tag Text] -getTagsBefore selector = takeWhile (~/= selector) - -getTagsAfter :: String -> [Tag Text] -> [Tag Text] -getTagsAfter selector = drop 1 . dropWhile (~/= selector) - -getTagsBetween :: String -> String -> [Tag Text] -> [Tag Text] -getTagsBetween begin end = getTagsBefore end . getTagsAfter begin - -getTagAttributes :: String -> Text -> [Tag Text] -> [Text] -getTagAttributes selector attribute = - catMaybes - . fmap (maybeTagAttribute attribute) - . filter (~== selector) - -getTagAttribute :: String -> Text -> [Tag Text] -> Maybe Text -getTagAttribute selector attribute = - listToMaybe - . getTagAttributes selector attribute - -getTagTextAfter :: String -> [Tag Text] -> Maybe Text -getTagTextAfter selector tags = - case findIndex (~== selector) tags of - Just index -> fmap T.strip $ safeGetAt (index + 1) tags >>= maybeTagText - Nothing -> Nothing - -maybeTagAttribute :: Text -> Tag Text -> Maybe Text -maybeTagAttribute name (TagOpen _ xs) = - fmap snd . find (\(x, _) -> x == name) $ xs -maybeTagAttribute _ _ = Nothing - -safeGetAt :: Int -> [a] -> Maybe a -safeGetAt index = listToMaybe . drop index - -hasClass :: Text -> Text -> Tag Text -> Bool -hasClass selector className = - tagOpen ((==) selector) (isJust . find matchClass) - where matchClass (name, values) = - ( name == (T.pack "class") - && (isJust . find ((==) className) . T.words $ values) - ) |