| author | Evan Martin <martine@danga.com> | 2008-10-22 03:08:10 (GMT) |
|---|---|---|
| committer | Evan Martin <martine@danga.com> | 2008-10-22 03:08:57 (GMT) |
| commit | 77c0a0435342b9575c3aa0b1d76c41919d4bfc27 (patch) | |
| tree | 888ba6852a12b29b8834467f2970524fc0cd64a0 | |
| parent | a4c82b068567ad6384a52c86c5bee2fe5ce62684 (diff) | |
faster commit parser (no longer using parsec)
| -rw-r--r-- | Commit.hs | 94 | ||||
| -rw-r--r-- | Commit_perftest.hs | 2 | ||||
| -rw-r--r-- | Commit_test.hs | 13 | ||||
| -rw-r--r-- | ObjectStore.hs | 2 | ||||
| -rw-r--r-- | Shared.hs | 6 |
5 files changed, 79 insertions, 38 deletions
@@ -1,8 +1,19 @@ -module Commit where +module Commit ( + Commit(..) + , parseCommit + -- Exposed for testing. + , searchBS +) where + +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as BI +import qualified Data.ByteString.Unsafe as BU import Control.Monad.Error import Text.ParserCombinators.Parsec +import Shared + -- TODO: -- Represent hashes as "Shared.Hash"es. -- Parse author/committter properly. @@ -20,36 +31,53 @@ emptyCommit = Commit [] [] [] [] [] type CommitParser a = GenParser Char Commit a +bsToString = map BI.w2c . B.unpack + +bsBreakAround :: Char -> B.ByteString -> (B.ByteString, B.ByteString) +bsBreakAround char str = + let (before, after) = B.break (== BI.c2w char) str + in (before, B.tail after) + +-- Search a ByteString for a substring, returning its offset. +-- Warning: naive search. +searchBS :: B.ByteString -> B.ByteString -> Maybe Int +searchBS needle str = tryNext str where + firstChar = B.head needle + tryNext str = do + ofs <- B.elemIndex firstChar str + let substr = B.drop ofs str + if needle `B.isPrefixOf` substr + then return ofs + else do + ofs' <- tryNext (B.drop 1 substr) + return (ofs + 1 + ofs') + -- Parse a raw commit object from the datastore into a commit. -parseCommit :: String -> Either String Commit -parseCommit input = - -- I'm unclear on whether the header lines are guaranteed to be in any - -- order, so for simplicitly we allow them in any order. - -- Parsec's permutation phrases require at most one of each instance, - -- so we can't use them (commits can have multiple parents). Instead, - -- we just update a Commit object's fields as we parse. - case runParser p_commit emptyCommit "" input of - Left error -> throwError $ "error parsing commit: " ++ show error - Right parse -> return parse - where - p_commit = do many p_header; newline; p_message; getState - p_header = p_tree <|> p_parent <|> p_author <|> p_committer - p_keyval str rest = do - try $ (string str >> char ' ') - rest - p_tree = do - hash <- p_keyval "tree" p_eol - updateState (\commit -> commit { commit_tree=hash }) - p_parent = do - hash <- p_keyval "parent" p_eol - updateState (\commit -> commit { commit_parents=(commit_parents commit)++[hash] }) - p_author = do - person <- p_keyval "author" p_eol - updateState (\commit -> commit { commit_author=person }) - p_committer = do - person <- p_keyval "committer" p_eol - updateState (\commit -> commit { commit_committer=person }) - p_eol = anyChar `manyTill` newline - p_message = do - message <- many anyChar - updateState (\commit -> commit { commit_message=message }) +parseCommit :: B.ByteString -> Either String Commit +parseCommit input = commit where + commit = do + case searchBS (makeBS "\n\n") input of + Nothing -> fail "couldn't parse commit" + Just ofs -> do + let headers = parseHeaders (B.take ofs input) + let message = bsToString $ B.drop (ofs+2) input + return $ applyHeaders (emptyCommit { commit_message=message }) headers + +-- Parse a newline-separated list of headers into key,value pairs. +parseHeaders :: B.ByteString -> [(String, String)] +parseHeaders str = map splitHeader headerlines where + headerlines = B.split (BI.c2w '\n') str + splitHeader header = + let (key, val) = bsBreakAround ' ' header + in (bsToString key, bsToString val) + +-- Apply a list of headers to a commit. +applyHeaders :: Commit -> [(String, String)] -> Commit +applyHeaders = foldl applyHeader where + applyHeader commit (key, val) + | key == "tree" = commit { commit_tree=val } + | key == "parent" = commit { commit_parents=words val } + | key == "author" = commit { commit_author=val } + | key == "committer" = commit { commit_committer=val } + | otherwise = commit -- XXX should we handle unparsed headers? + diff --git a/Commit_perftest.hs b/Commit_perftest.hs index 6bdaa3d..5c1831e 100644 --- a/Commit_perftest.hs +++ b/Commit_perftest.hs @@ -6,7 +6,7 @@ import Commit main = microbench "commit parsing" parseOneCommit where parseOneCommit = do - text <- readFile "testdata/commit" + text <- B.readFile "testdata/commit" let Right commit = parseCommit text unless (length (commit_parents commit) > 0) $ fail "misparse" diff --git a/Commit_test.hs b/Commit_test.hs index d0fa2aa..22237c2 100644 --- a/Commit_test.hs +++ b/Commit_test.hs @@ -1,10 +1,19 @@ +import qualified Data.ByteString as B import Test.HUnit import Commit +import Shared + +testSearchBS :: Test +testSearchBS = test $ do + let str = makeBS "foo\nbar\nbaz\n\nnewlines\n" + let (Just ofs) = searchBS (makeBS "\n\n") str + assertEqual "found double-nl" + (makeBS "\n\nnewlines\n") (B.drop ofs str) testParse :: Test testParse = test $ do - text <- readFile "testdata/commit" + text <- B.readFile "testdata/commit" let Right commit = parseCommit text assertEqual "tree parsed" "6cab39a126bb985be8ff6e3907f648d55c2a5c57" (commit_tree commit) @@ -17,4 +26,4 @@ testParse = test $ do assertEqual "message parsed" "generate docs in makefile\n" (commit_message commit) -main = runTestTT testParse +main = runTestTT (test [testSearchBS, testParse]) diff --git a/ObjectStore.hs b/ObjectStore.hs index f135083..46ef0cd 100644 --- a/ObjectStore.hs +++ b/ObjectStore.hs @@ -116,7 +116,7 @@ getObject hash = do tree <- forceError (parseTree raw) return (ObTree tree) TypeCommit -> do - commit <- forceError (parseCommit (bsToString raw)) + commit <- forceError (parseCommit (strictifyBS raw)) return (ObCommit commit) -- | @findTree hash@ fetches objects, starting at @hash@, following commits @@ -8,7 +8,7 @@ module Shared ( , isHashString , fromHex , splitMSB - , strictifyBS + , strictifyBS, makeBS , trace ) where @@ -82,3 +82,7 @@ splitMSB byte = (msb, bits) where -- |Convert a ByteString.Lazy to a strict ByteString. strictifyBS :: BL.ByteString -> B.ByteString strictifyBS = B.concat . BL.toChunks + +-- | Convert a String into a strict ByteString. +makeBS :: String -> B.ByteString +makeBS = B.pack . map (fromIntegral . fromEnum) |
