Skip to content

Commit

Permalink
Native dockerfile parser (#21160)
Browse files Browse the repository at this point in the history
Adds a dockerfile parser that aims to be ~compliant with the pex-python
one, but avoids installing the latter tooling.

1. I found that `tree-sitter-dockerfile` did not support value-less
options to instructions, so I've opened a fix for this.
2. I focused on being 1:1 with the old parser to the extent that
tree-sitter could (see
#21160 (comment))
without falling back to just parsing strings. The exception is ARG: because
the Python variant kept quotation marks, I opted to mirror that in Rust
directly, as the tree-sitter traversal was not useful there.

2 means that we allocate, duplicate and interpolate strings
unnecessarily often. I'd like to create a cleaner more rust-y data
structure in the future, but I think keeping the parser outputs similar
is more important right now.

---------

Co-authored-by: Huon Wilson <[email protected]>
  • Loading branch information
tobni and huonw authored Jul 16, 2024
1 parent 7452afe commit a91e9f2
Show file tree
Hide file tree
Showing 22 changed files with 754 additions and 51 deletions.
2 changes: 2 additions & 0 deletions docs/notes/2.23.x.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ Support for parametrizing grouped parametrizations. (e.g. `**parametrize(resolve

Docker inference is improved. Pants can now make inferences by target address for targets supporting `pants package`, and `file` targets can be included by filename. See the [documentation on Docker dependency inference](https://www.pantsbuild.org/2.23/docs/docker#dependency-inference-support) for details

Experimental support for a Rust-based Dockerfile parser can be enabled via the `[dockerfile-parser].use_rust_parser` option.

#### Scala

Source files no longer produce a dependency on Scala plugins. If you are using a Scala plugin that is also required by the source code (such as acyclic), please add an explicit dependency or set the `packages` field on the artifact.
Expand Down
119 changes: 81 additions & 38 deletions src/python/pants/backend/docker/subsystems/dockerfile_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from pants.backend.python.util_rules.pex import PexRequest, VenvPex, VenvPexProcess
from pants.engine.addresses import Address
from pants.engine.fs import CreateDigest, Digest, FileContent
from pants.engine.internals.native_engine import NativeDependenciesRequest
from pants.engine.intrinsics import parse_dockerfile_info
from pants.engine.process import Process, ProcessResult
from pants.engine.rules import Get, collect_rules, rule
from pants.engine.target import (
Expand All @@ -24,8 +26,11 @@
WrappedTarget,
WrappedTargetRequest,
)
from pants.option.option_types import BoolOption
from pants.util.docutil import bin_name
from pants.util.logging import LogLevel
from pants.util.resources import read_resource
from pants.util.strutil import softwrap

_DOCKERFILE_SANDBOX_TOOL = "dockerfile_wrapper_script.py"
_DOCKERFILE_PACKAGE = "pants.backend.docker.subsystems"
Expand All @@ -41,6 +46,23 @@ class DockerfileParser(PythonToolRequirementsBase):

default_lockfile_resource = (_DOCKERFILE_PACKAGE, "dockerfile.lock")

use_rust_parser = BoolOption(
default=False,
help=softwrap(
f"""
Use the new experimental Rust-based, multithreaded, in-process dependency parser.
If you think the new behaviour is causing problems, it is recommended that you run
`{bin_name()} --dockerfile-parser-use-rust-parser=True peek :: > new-parser.json` and then
`{bin_name()} --dockerfile-parser-use-rust-parser=False peek :: > old-parser.json` and compare the
two results.
If you think there is a bug, please file an issue:
https://github.com/pantsbuild/pants/issues/new/choose.
"""
),
)


@dataclass(frozen=True)
class ParserSetup:
Expand Down Expand Up @@ -119,8 +141,59 @@ class DockerfileInfoRequest:
address: Address


async def _natively_parse_dockerfile(address: Address, digest: Digest) -> DockerfileInfo:
    """Parse a Dockerfile via the in-process Rust parser intrinsic.

    Mirrors the output shape of the legacy subprocess-based parser so callers can
    switch between the two implementations transparently.

    :param address: Address of the target owning the Dockerfile (recorded on the result).
    :param digest: Digest containing the Dockerfile to parse.
    """
    parsed = await parse_dockerfile_info(NativeDependenciesRequest(digest))

    def _as_build_args(raw) -> DockerBuildArgs:
        # Every build-arg group requires that duplicate definitions agree.
        return DockerBuildArgs.from_strings(*raw, duplicates_must_match=True)

    return DockerfileInfo(
        address=address,
        digest=digest,
        source=parsed.source,
        build_args=_as_build_args(parsed.build_args),
        copy_source_paths=tuple(parsed.copy_source_paths),
        copy_build_args=_as_build_args(parsed.copy_build_args),
        from_image_build_args=_as_build_args(parsed.from_image_build_args),
        version_tags=tuple(parsed.version_tags),
    )


async def _legacy_parse_dockerfile(
    address: Address, digest: Digest, dockerfiles: tuple[str, ...]
) -> DockerfileInfo:
    """Parse Dockerfiles by invoking the legacy pex-based parser as a subprocess.

    :param address: Address of the target owning the Dockerfile (recorded on the result).
    :param digest: Digest containing the Dockerfile(s) to parse.
    :param dockerfiles: Relative paths of the Dockerfiles inside `digest`.
    :raises DockerfileInfoError: if the subprocess output cannot be decoded/parsed.
    """
    result = await Get(ProcessResult, DockerfileParseRequest(digest, dockerfiles))

    # Initialize before the `try` so the error message below can never hit an
    # UnboundLocalError if `decode()` itself is what raised.
    raw_output = ""
    try:
        raw_output = result.stdout.decode("utf-8")
        outputs = json.loads(raw_output)
        assert len(outputs) == len(dockerfiles)
    except Exception as e:
        raise DockerfileInfoError(
            f"Unexpected failure to parse Dockerfiles: {', '.join(dockerfiles)}, "
            f"for the {address} target: {e}\nDockerfile parser output:\n{raw_output}"
        ) from e
    # The rule above guarantees a single Dockerfile per request, so take the
    # first (only) parse result.
    info = outputs[0]
    return DockerfileInfo(
        address=address,
        digest=digest,
        source=info["source"],
        build_args=DockerBuildArgs.from_strings(*info["build_args"], duplicates_must_match=True),
        copy_source_paths=tuple(info["copy_source_paths"]),
        copy_build_args=DockerBuildArgs.from_strings(
            *info["copy_build_args"], duplicates_must_match=True
        ),
        from_image_build_args=DockerBuildArgs.from_strings(
            *info["from_image_build_args"], duplicates_must_match=True
        ),
        version_tags=tuple(info["version_tags"]),
    )


@rule
async def parse_dockerfile(request: DockerfileInfoRequest) -> DockerfileInfo:
async def parse_dockerfile(
request: DockerfileInfoRequest, dockerfile_parser: DockerfileParser
) -> DockerfileInfo:
wrapped_target = await Get(
WrappedTarget, WrappedTargetRequest(request.address, description_of_origin="<infallible>")
)
Expand All @@ -139,46 +212,16 @@ async def parse_dockerfile(request: DockerfileInfoRequest) -> DockerfileInfo:
f"Internal error: Expected a single source file to Dockerfile parse request {request}, "
f"got: {dockerfiles}."
)

result = await Get(
ProcessResult,
DockerfileParseRequest(
sources.snapshot.digest,
dockerfiles,
),
)

try:
raw_output = result.stdout.decode("utf-8")
outputs = json.loads(raw_output)
assert len(outputs) == len(dockerfiles)
except Exception as e:
raise DockerfileInfoError(
f"Unexpected failure to parse Dockerfiles: {', '.join(dockerfiles)}, "
f"for the {request.address} target: {e}\nDockerfile parser output:\n{raw_output}"
) from e

info = outputs[0]
try:
return DockerfileInfo(
address=request.address,
digest=sources.snapshot.digest,
source=info["source"],
build_args=DockerBuildArgs.from_strings(
*info["build_args"], duplicates_must_match=True
),
copy_source_paths=tuple(info["copy_source_paths"]),
copy_build_args=DockerBuildArgs.from_strings(
*info["copy_build_args"], duplicates_must_match=True
),
from_image_build_args=DockerBuildArgs.from_strings(
*info["from_image_build_args"], duplicates_must_match=True
),
version_tags=tuple(info["version_tags"]),
)
if dockerfile_parser.use_rust_parser:
return await _natively_parse_dockerfile(target.address, sources.snapshot.digest)
else:
return await _legacy_parse_dockerfile(
target.address, sources.snapshot.digest, dockerfiles
)
except ValueError as e:
raise DockerfileInfoError(
f"Error while parsing {info['source']} for the {request.address} target: {e}"
f"Error while parsing {dockerfiles[0]} for the {request.address} target: {e}"
) from e


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

from textwrap import dedent
from typing import cast

import pytest

Expand All @@ -19,8 +20,13 @@
from pants.testutil.rule_runner import QueryRule, RuleRunner


@pytest.fixture(params=[pytest.param(True, id="rust"), pytest.param(False, id="legacy")])
def use_rust_parser(request) -> bool:
    """Parametrized fixture so every test runs against both parser implementations."""
    enabled: bool = cast(bool, request.param)
    return enabled


@pytest.fixture
def rule_runner() -> RuleRunner:
def rule_runner(use_rust_parser: bool) -> RuleRunner:
rule_runner = RuleRunner(
rules=[
*dockerfile_rules(),
Expand All @@ -31,7 +37,7 @@ def rule_runner() -> RuleRunner:
target_types=[DockerImageTarget, PexBinary],
)
rule_runner.set_options(
[],
[f"--dockerfile-parser-use-rust-parser={use_rust_parser}"],
env_inherit={"PATH", "PYENV_ROOT", "HOME"},
)
return rule_runner
Expand Down Expand Up @@ -181,7 +187,7 @@ def test_copy_source_references(rule_runner: RuleRunner) -> None:
FROM base
COPY a b /
COPY --option c/d e/f/g /h
ADD ignored
ADD ignored ignored
COPY j k /
COPY
"""
Expand All @@ -204,6 +210,7 @@ def test_baseimage_tags(rule_runner: RuleRunner) -> None:
"FROM gcr.io/tekton-releases/github.com/tektoncd/operator/cmd/kubernetes/operator:"
"v0.54.0@sha256:d1f0463b35135852308ea815c2ae54c1734b876d90288ce35828aeeff9899f9d\n"
"FROM $PYTHON_VERSION AS python\n"
"FROM python:$VERSION\n"
),
}
)
Expand All @@ -215,6 +222,7 @@ def test_baseimage_tags(rule_runner: RuleRunner) -> None:
# Stage 2 is not pinned with a tag.
"stage3 v0.54.0",
"python build-arg:PYTHON_VERSION", # Parse tag from build arg.
"stage5 $VERSION",
)


Expand Down
10 changes: 10 additions & 0 deletions src/python/pants/engine/internals/native_dep_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,13 @@ class NativeParsedJavascriptDependencies:
def __init__(self, file_imports: set[str], package_imports: set[str]):
object.__setattr__(self, "file_imports", file_imports)
object.__setattr__(self, "package_imports", package_imports)


@dataclass(frozen=True)
class NativeParsedDockerfileInfo:
    """Dockerfile facts extracted by the native (Rust) dep-inference parser.

    All fields are plain strings to stay 1:1 with the output shape of the
    legacy pex-based parser.
    """

    # Presumably the path of the parsed Dockerfile — TODO confirm against the producer.
    source: str
    build_args: tuple[str, ...]  # "ARG_NAME=VALUE", ...
    copy_source_paths: tuple[str, ...]
    copy_build_args: tuple[str, ...]  # "ARG_NAME=UPSTREAM_TARGET_ADDRESS", ...
    from_image_build_args: tuple[str, ...]  # "ARG_NAME=UPSTREAM_TARGET_ADDRESS", ...
    version_tags: tuple[str, ...]  # "STAGE TAG", ...
4 changes: 4 additions & 0 deletions src/python/pants/engine/internals/native_engine.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ from pants.engine.fs import (
)
from pants.engine.internals.docker import DockerResolveImageRequest, DockerResolveImageResult
from pants.engine.internals.native_dep_inference import (
NativeParsedDockerfileInfo,
NativeParsedJavascriptDependencies,
NativeParsedPythonDependencies,
)
Expand Down Expand Up @@ -565,6 +566,9 @@ async def interactive_process(
process: InteractiveProcess, process_execution_environment: ProcessExecutionEnvironment
) -> InteractiveProcessResult: ...
async def docker_resolve_image(request: DockerResolveImageRequest) -> DockerResolveImageResult: ...
async def parse_dockerfile_info(
deps_request: NativeDependenciesRequest,
) -> NativeParsedDockerfileInfo: ...
async def parse_python_deps(
deps_request: NativeDependenciesRequest,
) -> NativeParsedPythonDependencies: ...
Expand Down
2 changes: 2 additions & 0 deletions src/python/pants/engine/internals/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from pants.engine.internals import native_engine
from pants.engine.internals.docker import DockerResolveImageRequest, DockerResolveImageResult
from pants.engine.internals.native_dep_inference import (
NativeParsedDockerfileInfo,
NativeParsedJavascriptDependencies,
NativeParsedPythonDependencies,
)
Expand Down Expand Up @@ -180,6 +181,7 @@ def __init__(
docker_resolve_image_result=DockerResolveImageResult,
parsed_python_deps_result=NativeParsedPythonDependencies,
parsed_javascript_deps_result=NativeParsedJavascriptDependencies,
parsed_dockerfile_info_result=NativeParsedDockerfileInfo,
)
remoting_options = PyRemotingOptions(
provider=execution_options.remote_provider.value,
Expand Down
8 changes: 8 additions & 0 deletions src/python/pants/engine/intrinsics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from pants.engine.internals import native_engine
from pants.engine.internals.docker import DockerResolveImageRequest, DockerResolveImageResult
from pants.engine.internals.native_dep_inference import (
NativeParsedDockerfileInfo,
NativeParsedJavascriptDependencies,
NativeParsedPythonDependencies,
)
Expand Down Expand Up @@ -131,6 +132,13 @@ async def docker_resolve_image(request: DockerResolveImageRequest) -> DockerReso
return await native_engine.docker_resolve_image(request)


@rule
async def parse_dockerfile_info(
    deps_request: NativeDependenciesRequest,
) -> NativeParsedDockerfileInfo:
    """Delegate Dockerfile parsing to the native (Rust) engine intrinsic."""
    return await native_engine.parse_dockerfile_info(deps_request)


@rule
async def parse_python_deps(
deps_request: NativeDependenciesRequest,
Expand Down
13 changes: 13 additions & 0 deletions src/rust/engine/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion src/rust/engine/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ humansize = "1.1"
hyper = "0.14"
hyper-rustls = "0.24"
ignore = { git = "https://github.com/pantsbuild/ripgrep.git", rev = "0f7e0fdd00ae528745a7fea24a320cae98235341" }
indexmap = "1.9"
indexmap = { version = "1.9", features = ["std", "serde"] }
indicatif = "0.17.8"
internment = "0.6"
itertools = "0.10"
Expand Down Expand Up @@ -335,6 +335,8 @@ whoami = "1.4.1"
tree-sitter = "0.20.10"
tree-sitter-javascript = "0.20.1"
tree-sitter-python = "0.20.4"
# TODO: Waiting on https://github.com/camdencheek/tree-sitter-dockerfile/pull/58.
tree-sitter-dockerfile = { git = "https://github.com/tobni/tree-sitter-dockerfile.git", rev = "ce1cbc3537995f0d968c8bc6315b6a62d6faacde" }

# Default lints adopted by most crates in this workspace.

Expand Down
5 changes: 5 additions & 0 deletions src/rust/engine/dep_inference/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,21 @@ walkdir = { workspace = true }
tree-sitter = { workspace = true }
tree-sitter-javascript = { workspace = true }
tree-sitter-python = { workspace = true }
tree-sitter-dockerfile = { workspace = true }

[dependencies]
fnv = { workspace = true }
protos = { path = "../protos" }
indexmap = { workspace = true }
serde = { workspace = true }
serde_derive = { workspace = true }
itertools = { workspace = true }
tree-sitter = { workspace = true }
tree-sitter-javascript = { workspace = true }
tree-sitter-python = { workspace = true }
tree-sitter-dockerfile = { workspace = true }
lazy_static = { workspace = true }
regex = { workspace = true }

[lints]
workspace = true
6 changes: 6 additions & 0 deletions src/rust/engine/dep_inference/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,12 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
&source_dir,
out_dir,
)?;
gen_files_for_language(
tree_sitter_dockerfile::language(),
"dockerfile",
&source_dir,
out_dir,
)?;
println!("cargo:rerun-if-env-changed=PANTS_PRINT_IMPL_HASHES");
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed=src");
Expand Down
11 changes: 11 additions & 0 deletions src/rust/engine/dep_inference/src/code.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Copyright 2024 Pants project contributors (see CONTRIBUTORS.md).
// Licensed under the Apache License, Version 2.0 (see LICENSE).
use tree_sitter::{Node, Range};

/// Returns the slice of `code` covered by `node`'s byte range.
pub fn at<'a>(code: &'a str, node: Node) -> &'a str {
    at_range(code, node.range())
}

/// Returns the slice of `code` between `range.start_byte` and `range.end_byte`.
///
/// Panics if the range is out of bounds or does not fall on UTF-8 character
/// boundaries; callers are expected to pass ranges produced by tree-sitter
/// over this same `code` string.
pub fn at_range(code: &str, range: Range) -> &str {
    &code[range.start_byte..range.end_byte]
}
Loading

0 comments on commit a91e9f2

Please sign in to comment.