summaryrefslogtreecommitdiff
authorEvan Martin <martine@danga.com>2008-10-22 03:08:10 (GMT)
committer Evan Martin <martine@danga.com>2008-10-22 03:08:57 (GMT)
commit77c0a0435342b9575c3aa0b1d76c41919d4bfc27 (patch)
tree888ba6852a12b29b8834467f2970524fc0cd64a0
parenta4c82b068567ad6384a52c86c5bee2fe5ce62684 (diff)
faster commit parser (no longer using parsec)
Diffstat
-rw-r--r--Commit.hs94
-rw-r--r--Commit_perftest.hs2
-rw-r--r--Commit_test.hs13
-rw-r--r--ObjectStore.hs2
-rw-r--r--Shared.hs6
5 files changed, 79 insertions, 38 deletions
diff --git a/Commit.hs b/Commit.hs
index 5a89b00..3854efa 100644
--- a/Commit.hs
+++ b/Commit.hs
@@ -1,8 +1,19 @@
-module Commit where
+module Commit (
+ Commit(..)
+ , parseCommit
+ -- Exposed for testing.
+ , searchBS
+) where
+
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Internal as BI
+import qualified Data.ByteString.Unsafe as BU
import Control.Monad.Error
import Text.ParserCombinators.Parsec
+import Shared
+
-- TODO:
-- Represent hashes as "Shared.Hash"es.
-- Parse author/committter properly.
@@ -20,36 +31,53 @@ emptyCommit = Commit [] [] [] [] []
type CommitParser a = GenParser Char Commit a
+bsToString = map BI.w2c . B.unpack
+
+bsBreakAround :: Char -> B.ByteString -> (B.ByteString, B.ByteString)
+bsBreakAround char str =
+ let (before, after) = B.break (== BI.c2w char) str
+ in (before, B.tail after)
+
+-- Search a ByteString for a substring, returning its offset.
+-- Warning: naive search.
+searchBS :: B.ByteString -> B.ByteString -> Maybe Int
+searchBS needle str = tryNext str where
+ firstChar = B.head needle
+ tryNext str = do
+ ofs <- B.elemIndex firstChar str
+ let substr = B.drop ofs str
+ if needle `B.isPrefixOf` substr
+ then return ofs
+ else do
+ ofs' <- tryNext (B.drop 1 substr)
+ return (ofs + 1 + ofs')
+
-- Parse a raw commit object from the datastore into a commit.
-parseCommit :: String -> Either String Commit
-parseCommit input =
- -- I'm unclear on whether the header lines are guaranteed to be in any
- -- order, so for simplicitly we allow them in any order.
- -- Parsec's permutation phrases require at most one of each instance,
- -- so we can't use them (commits can have multiple parents). Instead,
- -- we just update a Commit object's fields as we parse.
- case runParser p_commit emptyCommit "" input of
- Left error -> throwError $ "error parsing commit: " ++ show error
- Right parse -> return parse
- where
- p_commit = do many p_header; newline; p_message; getState
- p_header = p_tree <|> p_parent <|> p_author <|> p_committer
- p_keyval str rest = do
- try $ (string str >> char ' ')
- rest
- p_tree = do
- hash <- p_keyval "tree" p_eol
- updateState (\commit -> commit { commit_tree=hash })
- p_parent = do
- hash <- p_keyval "parent" p_eol
- updateState (\commit -> commit { commit_parents=(commit_parents commit)++[hash] })
- p_author = do
- person <- p_keyval "author" p_eol
- updateState (\commit -> commit { commit_author=person })
- p_committer = do
- person <- p_keyval "committer" p_eol
- updateState (\commit -> commit { commit_committer=person })
- p_eol = anyChar `manyTill` newline
- p_message = do
- message <- many anyChar
- updateState (\commit -> commit { commit_message=message })
+parseCommit :: B.ByteString -> Either String Commit
+parseCommit input = commit where
+ commit = do
+ case searchBS (makeBS "\n\n") input of
+ Nothing -> fail "couldn't parse commit"
+ Just ofs -> do
+ let headers = parseHeaders (B.take ofs input)
+ let message = bsToString $ B.drop (ofs+2) input
+ return $ applyHeaders (emptyCommit { commit_message=message }) headers
+
+-- Parse a newline-separated list of headers into key,value pairs.
+parseHeaders :: B.ByteString -> [(String, String)]
+parseHeaders str = map splitHeader headerlines where
+ headerlines = B.split (BI.c2w '\n') str
+ splitHeader header =
+ let (key, val) = bsBreakAround ' ' header
+ in (bsToString key, bsToString val)
+
+-- Apply a list of headers to a commit.
+applyHeaders :: Commit -> [(String, String)] -> Commit
+applyHeaders = foldl applyHeader where
+ applyHeader commit (key, val)
+ | key == "tree" = commit { commit_tree=val }
+ | key == "parent" = commit { commit_parents=words val }
+ | key == "author" = commit { commit_author=val }
+ | key == "committer" = commit { commit_committer=val }
+ | otherwise = commit -- XXX should we handle unparsed headers?
+
diff --git a/Commit_perftest.hs b/Commit_perftest.hs
index 6bdaa3d..5c1831e 100644
--- a/Commit_perftest.hs
+++ b/Commit_perftest.hs
@@ -6,7 +6,7 @@ import Commit
main = microbench "commit parsing" parseOneCommit where
parseOneCommit = do
- text <- readFile "testdata/commit"
+ text <- B.readFile "testdata/commit"
let Right commit = parseCommit text
unless (length (commit_parents commit) > 0) $
fail "misparse"
diff --git a/Commit_test.hs b/Commit_test.hs
index d0fa2aa..22237c2 100644
--- a/Commit_test.hs
+++ b/Commit_test.hs
@@ -1,10 +1,19 @@
+import qualified Data.ByteString as B
import Test.HUnit
import Commit
+import Shared
+
+testSearchBS :: Test
+testSearchBS = test $ do
+ let str = makeBS "foo\nbar\nbaz\n\nnewlines\n"
+ let (Just ofs) = searchBS (makeBS "\n\n") str
+ assertEqual "found double-nl"
+ (makeBS "\n\nnewlines\n") (B.drop ofs str)
testParse :: Test
testParse = test $ do
- text <- readFile "testdata/commit"
+ text <- B.readFile "testdata/commit"
let Right commit = parseCommit text
assertEqual "tree parsed"
"6cab39a126bb985be8ff6e3907f648d55c2a5c57" (commit_tree commit)
@@ -17,4 +26,4 @@ testParse = test $ do
assertEqual "message parsed"
"generate docs in makefile\n" (commit_message commit)
-main = runTestTT testParse
+main = runTestTT (test [testSearchBS, testParse])
diff --git a/ObjectStore.hs b/ObjectStore.hs
index f135083..46ef0cd 100644
--- a/ObjectStore.hs
+++ b/ObjectStore.hs
@@ -116,7 +116,7 @@ getObject hash = do
tree <- forceError (parseTree raw)
return (ObTree tree)
TypeCommit -> do
- commit <- forceError (parseCommit (bsToString raw))
+ commit <- forceError (parseCommit (strictifyBS raw))
return (ObCommit commit)
-- | @findTree hash@ fetches objects, starting at @hash@, following commits
diff --git a/Shared.hs b/Shared.hs
index 0ba7d12..a93a40f 100644
--- a/Shared.hs
+++ b/Shared.hs
@@ -8,7 +8,7 @@ module Shared (
, isHashString
, fromHex
, splitMSB
- , strictifyBS
+ , strictifyBS, makeBS
, trace
) where
@@ -82,3 +82,7 @@ splitMSB byte = (msb, bits) where
-- |Convert a ByteString.Lazy to a strict ByteString.
strictifyBS :: BL.ByteString -> B.ByteString
strictifyBS = B.concat . BL.toChunks
+
+-- | Convert a String into a strict ByteString.
+makeBS :: String -> B.ByteString
+makeBS = B.pack . map (fromIntegral . fromEnum)