-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[#64] Implement copy paste protection #245
base: YuriRomanowski/#64-refactor-markdown-scanner
Are you sure you want to change the base?
Changes from 1 commit
44f21e5
d497a1b
2e0d0f6
a844583
fd71f97
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,20 +26,33 @@ module Xrefcheck.Verify | |
, verifyReference | ||
, checkExternalResource | ||
|
||
-- * Copypaste check | ||
, checkCopyPaste | ||
, CopyPasteCheckResult (..) | ||
|
||
-- * URI parsing | ||
, parseUri | ||
|
||
-- * Reporting errors | ||
, reportVerifyErrs | ||
, reportCopyPasteErrors | ||
) where | ||
|
||
import Universum | ||
|
||
import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync) | ||
import Control.Exception (AsyncException (..), throwIO) | ||
import Control.Exception.Safe (handleAsync, handleJust) | ||
import Control.Monad.Except (MonadError (..)) | ||
import Data.Bits (toIntegralSized) | ||
import Data.ByteString qualified as BS | ||
import Data.Char (isAlphaNum) | ||
import Data.List (lookup) | ||
import Data.List qualified as L | ||
import Data.Map qualified as M | ||
import Data.Reflection (Given) | ||
import Data.Text (toCaseFold) | ||
import Data.Text qualified as T | ||
import Data.Text.Metrics (damerauLevenshteinNorm) | ||
import Data.Time (UTCTime, defaultTimeLocale, formatTime, readPTime, rfc822DateFormat) | ||
import Data.Time.Clock.POSIX (getPOSIXTime) | ||
|
@@ -66,10 +79,6 @@ import Text.URI (Authority (..), ParseExceptionBs, URI (..), mkURIBs) | |
import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-)) | ||
import URI.ByteString qualified as URIBS | ||
|
||
import Control.Exception.Safe (handleAsync, handleJust) | ||
import Data.Bits (toIntegralSized) | ||
import Data.List (lookup) | ||
import Data.Text (toCaseFold) | ||
import Xrefcheck.Config | ||
import Xrefcheck.Core | ||
import Xrefcheck.Orphans () | ||
|
@@ -255,6 +264,21 @@ instance Given ColorMode => Buildable VerifyError where | |
#{redirectedUrl} | ||
|] | ||
|
||
data CopyPasteCheckResult = CopyPasteCheckResult | ||
{ crFile :: FilePath, | ||
crOriginalRef :: Reference, | ||
crCopiedRef :: Reference | ||
Martoon-00 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} deriving stock (Show, Eq, Ord) | ||
|
||
instance (Given ColorMode) => Buildable CopyPasteCheckResult where | ||
build CopyPasteCheckResult {..} = | ||
[int|| | ||
In file #{styleIfNeeded Faint (styleIfNeeded Bold crFile)} | ||
#{crCopiedRef}\ | ||
is possibly a bad copy paste of | ||
#{crOriginalRef} | ||
|] | ||
|
||
reportVerifyErrs | ||
:: Given ColorMode => NonEmpty (WithReferenceLoc VerifyError) -> IO () | ||
reportVerifyErrs errs = fmt | ||
|
@@ -265,6 +289,17 @@ reportVerifyErrs errs = fmt | |
Invalid references dumped, #{length errs} in total. | ||
|] | ||
|
||
reportCopyPasteErrors | ||
:: Given ColorMode => NonEmpty CopyPasteCheckResult -> IO () | ||
reportCopyPasteErrors errs = fmt | ||
[int|| | ||
=== Possible copy/paste errors === | ||
|
||
#{interpolateIndentF 2 (interpolateBlockListF' "➥ " build errs)} | ||
Possible copy/paste errors dumped, #{length errs} in total. | ||
|] | ||
|
||
aeqz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
data RetryAfter = Date UTCTime | Seconds (Time Second) | ||
deriving stock (Show, Eq) | ||
|
@@ -355,32 +390,40 @@ verifyRepo | |
-> VerifyMode | ||
-> FilePath | ||
-> RepoInfo | ||
-> IO (VerifyResult $ WithReferenceLoc VerifyError) | ||
-> IO (VerifyResult $ WithReferenceLoc VerifyError, [CopyPasteCheckResult]) | ||
verifyRepo | ||
rw | ||
config@Config{..} | ||
mode | ||
root | ||
repoInfo'@(RepoInfo files _) | ||
= do | ||
let toScan = do | ||
(file, fileInfo) <- M.toList files | ||
|
||
let filesToScan = flip mapMaybe (M.toList files) $ \(file, fileInfo) -> do | ||
guard . not $ matchesGlobPatterns root (ecIgnoreRefsFrom cExclusions) file | ||
case fileInfo of | ||
Scanned fi -> do | ||
ref <- _fiReferences fi | ||
return (file, ref) | ||
NotScannable -> empty -- No support for such file, can do nothing. | ||
NotAddedToGit -> empty -- If this file is scannable, we've notified | ||
Just (file, fi) | ||
NotScannable -> Nothing -- No support for such file, can do nothing. | ||
NotAddedToGit -> Nothing -- If this file is scannable, we've notified | ||
-- user that we are scanning only files | ||
-- added to Git while gathering RepoInfo. | ||
|
||
toCheckCopyPaste = map (second _fiReferences) filesToScan | ||
toScan = concatMap (\(file, fileInfo) -> map (file,) $ _fiReferences fileInfo) filesToScan | ||
copyPasteErrors = if scCopyPasteCheckEnabled cScanners | ||
then [ res | ||
| (file, refs) <- toCheckCopyPaste, | ||
res <- checkCopyPaste file refs | ||
] | ||
else [] | ||
Martoon-00 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
progressRef <- newIORef $ initVerifyProgress (map snd toScan) | ||
|
||
accumulated <- loopAsyncUntil (printer progressRef) do | ||
forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) -> | ||
verifyReference config mode progressRef repoInfo' root file ref | ||
case accumulated of | ||
(, copyPasteErrors) <$> case accumulated of | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmhm, I'm worried about how it's integrated into existing verification:
Also currently this logic is quite hanging at the end, we could extract copy-paste checking to a separate function outside of But I suppose that simultaneously resolving all the 3 things I mentioned above is impossible. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh I somehow missed your first comment in this PR. Let's probably discuss it here. Regarding the progress bar, options that came to my mind:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For 3: I think we can't combine reference checks and copypaste checks in one progress bar, because the number of links for the latter check should be much fewer. We could of course say that check already passed for those links, but it's strange. And we will need the common storage for storing statuses of links, this will cause extra problems for the concurrency. It seems that there will be much more overhead than worth. |
||
Right res -> return $ fold res | ||
Left (exception, partialRes) -> do | ||
-- The user has hit Ctrl+C; display any verification errors we managed to find and exit. | ||
|
@@ -412,6 +455,41 @@ verifyRepo | |
ExternalLoc -> CacheUnderKey rLink | ||
_ -> NoCaching | ||
|
||
checkCopyPaste :: FilePath -> [Reference] -> [CopyPasteCheckResult] | ||
checkCopyPaste file refs = do | ||
let getLinkAndAnchor x = (rLink x, rAnchor x) | ||
groupedRefs = | ||
L.groupBy ((==) `on` getLinkAndAnchor) $ | ||
Martoon-00 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
sortBy (compare `on` getLinkAndAnchor) refs | ||
concatMap checkGroup groupedRefs | ||
where | ||
checkGroup :: [Reference] -> [CopyPasteCheckResult] | ||
checkGroup refsInGroup = do | ||
let mergeLinkAndAnchor ref = maybe (rLink ref) (rLink ref <>) $ rAnchor ref | ||
let refsInGroup' = flip map refsInGroup $ \ref -> | ||
Martoon-00 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
(ref, (prepareNameForCheck $ rName ref, | ||
prepareNameForCheck $ mergeLinkAndAnchor ref)) | ||
-- Most of time this will be Nothing and we won't need `others`. | ||
-- The first matching link will be shown as original. | ||
let mbSubstrRef = fst <$> find (textIsLinkSubstr . snd) refsInGroup' | ||
others = fst <$> filter (not . textIsLinkSubstr . snd) refsInGroup' | ||
Martoon-00 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
maybe [] (\substrRef -> map (CopyPasteCheckResult file substrRef) others) mbSubstrRef | ||
Martoon-00 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
textIsLinkSubstr :: (Text, Text) -> Bool | ||
textIsLinkSubstr (prepName, prepLink) = prepName `isSubSeq` prepLink | ||
|
||
prepareNameForCheck :: Text -> Text | ||
prepareNameForCheck = T.toLower . T.filter isAlphaNum | ||
|
||
isSubSeq :: Text -> Text -> Bool | ||
isSubSeq "" _str = True | ||
isSubSeq _que "" = False | ||
isSubSeq que str | ||
| qhead == shead = isSubSeq qtail stail | ||
| otherwise = isSubSeq que stail | ||
where (qhead, qtail) = T.splitAt 1 que | ||
(shead, stail) = T.splitAt 1 str | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh nice, checking on subsequences is singificantly better than checking on substrings I believe. The downside of this is that we may now detect complete nonsense like
and the smaller the text is, the higher will be the chance of false triggering. What do you think? My thoughts - spoilerOut of my head, we could split the text into subwords, letters of each subword will have to be encountered in a row in the link, while different subwords may appear separated.
Perhaps subwords don't even need to appear in the link+anchor in the same order, this should be recognized:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perfect 👍 I like this, let's do this way. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, there are some issues with this approach: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair 🤔 🤔 |
||
|
||
Martoon-00 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
shouldCheckLocType :: VerifyMode -> LocationType -> Bool | ||
shouldCheckLocType mode locType | ||
| isExternal locType = shouldCheckExternal mode | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
{- SPDX-FileCopyrightText: 2019 Serokell <https://serokell.io> | ||
- | ||
- SPDX-License-Identifier: MPL-2.0 | ||
-} | ||
|
||
module Test.Xrefcheck.CopyPasteCheckSpec where | ||
|
||
import Universum | ||
|
||
import Test.Tasty (TestTree, testGroup) | ||
import Test.Tasty.HUnit (Assertion, testCase, (@?=)) | ||
|
||
import Xrefcheck.Core | ||
import Xrefcheck.Verify | ||
|
||
assertUnordered :: (Show a, Ord a) => [a] -> [a] -> Assertion | ||
assertUnordered = (@?=) `on` sort | ||
|
||
testPath :: FilePath | ||
testPath = "test-path" | ||
|
||
test_copyPasteCheck :: TestTree | ||
test_copyPasteCheck = testGroup "Copypaste check" | ||
[ testCase "Detect copypaste error if there is a link with a matching name" $ do | ||
let link = "./first-file" | ||
anchor = Just "heading" | ||
differentAnchor = Nothing | ||
defPos = Position Nothing | ||
original1 = Reference "_- First - - File" link anchor defPos | ||
original2 = Reference "_- First - fi - le" link anchor defPos | ||
notCopied = Reference " Link 2 " link differentAnchor defPos | ||
copied1 = Reference " foo bar" link anchor defPos | ||
copied2 = Reference " Baz quux" link anchor defPos | ||
input = [original1, original2, notCopied, copied1, copied2] | ||
res = checkCopyPaste testPath input | ||
expectedRes = | ||
-- only first matching link is shown in the output | ||
[ CopyPasteCheckResult testPath original1 copied1 | ||
, CopyPasteCheckResult testPath original1 copied2 | ||
] | ||
res `assertUnordered` expectedRes | ||
, testCase "Succeed if there is not link with a matching name" $ do | ||
let link = "./first-file" | ||
anchor = Just "heading" | ||
defPos = Position Nothing | ||
original1 = Reference "_Foo bar" link anchor defPos | ||
original2 = Reference " Baz quux" link anchor defPos | ||
original3 = Reference " Foo qubarx" link anchor defPos | ||
input = [original1, original2, original3] | ||
res = checkCopyPaste testPath input | ||
expectedRes = [] | ||
res @?= expectedRes | ||
, testCase "Check external links" $ do | ||
let link = "https://github.com" | ||
anchor = Nothing | ||
defPos = Position Nothing | ||
original = Reference "github" link anchor defPos | ||
copied = Reference "gitlab" link anchor defPos | ||
input = [original, copied] | ||
res = checkCopyPaste testPath input | ||
expectedRes = | ||
[ CopyPasteCheckResult testPath original copied | ||
] | ||
res @?= expectedRes | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env bats | ||
|
||
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io> | ||
# | ||
# SPDX-License-Identifier: MPL-2.0 | ||
|
||
load '../helpers/bats-support/load' | ||
load '../helpers/bats-assert/load' | ||
load '../helpers/bats-file/load' | ||
load '../helpers' | ||
|
||
|
||
@test "Check possible copy-paste errors and copy-paste annotations " { | ||
to_temp xrefcheck | ||
|
||
assert_diff expected.gold | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
=== Possible copy/paste errors === | ||
aeqz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
➥ In file second-file.md | ||
reference (relative) at src:13:1-29: | ||
- text: "Lol Kek" | ||
- link: ./first-file.md | ||
- anchor: - | ||
is possibly a bad copy paste of | ||
reference (relative) at src:7:1-34: | ||
- text: "First file" | ||
- link: ./first-file.md | ||
- anchor: - | ||
|
||
➥ In file second-file.md | ||
reference (relative) at src:14:1-30: | ||
- text: "Baz quux" | ||
- link: ./first-file.md | ||
- anchor: - | ||
is possibly a bad copy paste of | ||
reference (relative) at src:7:1-34: | ||
- text: "First file" | ||
- link: ./first-file.md | ||
- anchor: - | ||
|
||
➥ In file second-file.md | ||
reference (relative) at src:24:1-29: | ||
- text: "fdw" | ||
- link: ./first-file.md | ||
- anchor: chor | ||
is possibly a bad copy paste of | ||
reference (relative) at src:23:1-32: | ||
- text: "ff-cho" | ||
- link: ./first-file.md | ||
- anchor: chor | ||
|
||
➥ In file second-file.md | ||
reference (external) at src:29:1-28: | ||
- text: "gitlab" | ||
- link: https://github.com | ||
- anchor: - | ||
is possibly a bad copy paste of | ||
reference (external) at src:28:1-28: | ||
- text: "github" | ||
- link: https://github.com | ||
- anchor: - | ||
|
||
Possible copy/paste errors dumped, 4 in total. | ||
All repository links are valid. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm thinking about whether this should be
true
by default (which may provide false positives and force the user to take manual actions) orfalse
(in such case the user may never notice that we provide such a feature).And you know, after looking at this and at #250, I suspect that we should have two modes in
dump-config
command depending on how much strict checks the user wants to have.@YuriRomanowski @aeqz What do you think?
No need to do it here, if we agree on this I'll create a separate ticket.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By being consistent with the aims of this project in the
README
file, I think that it should befalse
by default, but I also think that then the user may never notice about the feature.To consider other options, we could also try to add an intermediate configuration setting:
where:
error
shows them as errors.warning
shows them as warnings, which do not make the program to exit with error code.disabled
disables the feature.and the default would be
warning
.In the case of going for the different
dump-config
strictness modes, I guess that this mode argument should be mandatory. Otherwise, we would be again discussing what should be the default, and the weak one could make the user to miss some features.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm do we need this? I think that the user can look at the error decriptions and decide whether to just ignore those links, or to fix them immediately, it seems that showing them as warnings is rather useless. So, I would agree with @Martoon-00, we can just provide some modes of the default config, and using true/false for different modes perfectly suites the idea.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I understood correctly, the doubt was in deciding whether to enable this check by default, which could produce false positives in CI, or to disable it and possibly make the user to never know about the feature.
I think that providing modes of default config can preserve the same problem, because then we would have to decide whether to set a strong one as the default and produce false positives in CI, or to set a weak one and make the user to never know about some features, although it would be easier to manage.
The point in my suggestion was to consider an intermediate option that does not produce false positives in CI, but keeps showing messages that the user can see in order to know about the feature.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmhmhm 🤔 🤔 Both your points make sense to me, I'll need some time to digest this and produce my thoughts.