| field | value | date |
|---|---|---|
| author | eduartua <eduartua@gmail.com> | 2019-01-31 13:36:01 -0600 |
| committer | eduartua <eduartua@gmail.com> | 2019-01-31 13:36:01 -0600 |
| commit | 0bacd3d192425ee2d5bc2387ec8f000a5b76a22b | |
| tree | 7b16d8046e3233ac5f48119af0b253a87f441a10 /contributors | |
| parent | 1e4beeda12d7f1191cfc53c821f134b00951da00 | |
| parent | b7b910a2351301d39c8de38ff0156aa9aad88d42 | |
Some files had conflicts with some URLs - fixed now
Diffstat (limited to 'contributors')
75 files changed, 3931 insertions(+), 3200 deletions(-)
diff --git a/contributors/design-proposals/OWNERS b/contributors/design-proposals/OWNERS
index c6a712b8..7bda97c6 100644
--- a/contributors/design-proposals/OWNERS
+++ b/contributors/design-proposals/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - brendandburns
 - dchen1107
diff --git a/contributors/design-proposals/api-machinery/OWNERS b/contributors/design-proposals/api-machinery/OWNERS
index 0df76e64..ef142b0f 100644
--- a/contributors/design-proposals/api-machinery/OWNERS
+++ b/contributors/design-proposals/api-machinery/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-api-machinery-leads
 approvers:
diff --git a/contributors/design-proposals/apps/OWNERS b/contributors/design-proposals/apps/OWNERS
index 12723930..f36b2fcd 100644
--- a/contributors/design-proposals/apps/OWNERS
+++ b/contributors/design-proposals/apps/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-apps-leads
 approvers:
diff --git a/contributors/design-proposals/architecture/OWNERS b/contributors/design-proposals/architecture/OWNERS
index 87364abb..3baa861d 100644
--- a/contributors/design-proposals/architecture/OWNERS
+++ b/contributors/design-proposals/architecture/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-architecture-leads
 - jbeda
diff --git a/contributors/design-proposals/auth/OWNERS b/contributors/design-proposals/auth/OWNERS
index 3100c753..ef998d7e 100644
--- a/contributors/design-proposals/auth/OWNERS
+++ b/contributors/design-proposals/auth/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-auth-leads
 approvers:
diff --git a/contributors/design-proposals/autoscaling/OWNERS b/contributors/design-proposals/autoscaling/OWNERS
index 17089492..9a70bb4c 100644
--- a/contributors/design-proposals/autoscaling/OWNERS
+++ b/contributors/design-proposals/autoscaling/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-autoscaling-leads
 approvers:
diff --git a/contributors/design-proposals/aws/OWNERS b/contributors/design-proposals/aws/OWNERS
index 83317bbe..cc03b55d 100644
--- a/contributors/design-proposals/aws/OWNERS
+++ b/contributors/design-proposals/aws/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-aws-leads
 approvers:
diff --git a/contributors/design-proposals/cli/OWNERS b/contributors/design-proposals/cli/OWNERS
index 248d3e7c..96fdea25 100644
--- a/contributors/design-proposals/cli/OWNERS
+++ b/contributors/design-proposals/cli/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-cli-leads
 approvers:
diff --git a/contributors/design-proposals/cluster-lifecycle/OWNERS b/contributors/design-proposals/cluster-lifecycle/OWNERS
index d69f24ee..71322d9e 100644
--- a/contributors/design-proposals/cluster-lifecycle/OWNERS
+++ b/contributors/design-proposals/cluster-lifecycle/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-cluster-lifecycle-leads
 approvers:
diff --git a/contributors/design-proposals/cluster-lifecycle/clustering/OWNERS b/contributors/design-proposals/cluster-lifecycle/clustering/OWNERS
index b3d71823..741be590 100644
--- a/contributors/design-proposals/cluster-lifecycle/clustering/OWNERS
+++ b/contributors/design-proposals/cluster-lifecycle/clustering/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - michelleN
 approvers:
diff --git a/contributors/design-proposals/gcp/OWNERS b/contributors/design-proposals/gcp/OWNERS
index cd2232f4..4ff966b4 100644
--- a/contributors/design-proposals/gcp/OWNERS
+++ b/contributors/design-proposals/gcp/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-gcp-leads
 approvers:
diff --git a/contributors/design-proposals/instrumentation/OWNERS b/contributors/design-proposals/instrumentation/OWNERS
index 8e29eafa..3e1efb0c 100644
--- a/contributors/design-proposals/instrumentation/OWNERS
+++ b/contributors/design-proposals/instrumentation/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-instrumentation-leads
 approvers:
diff --git a/contributors/design-proposals/multicluster/OWNERS b/contributors/design-proposals/multicluster/OWNERS
index fca0e564..bedef962 100644
--- a/contributors/design-proposals/multicluster/OWNERS
+++ b/contributors/design-proposals/multicluster/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-multicluster-leads
 approvers:
diff --git a/contributors/design-proposals/network/OWNERS b/contributors/design-proposals/network/OWNERS
index 1939ca5c..42bb9ad2 100644
--- a/contributors/design-proposals/network/OWNERS
+++ b/contributors/design-proposals/network/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-network-leads
 approvers:
diff --git a/contributors/design-proposals/node/OWNERS b/contributors/design-proposals/node/OWNERS
index ab6d8dd5..810bc689 100644
--- a/contributors/design-proposals/node/OWNERS
+++ b/contributors/design-proposals/node/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-node-leads
 approvers:
diff --git a/contributors/design-proposals/release/OWNERS b/contributors/design-proposals/release/OWNERS
index 9d8e7403..c414be94 100644
--- a/contributors/design-proposals/release/OWNERS
+++ b/contributors/design-proposals/release/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-release-leads
 approvers:
diff --git a/contributors/design-proposals/resource-management/OWNERS b/contributors/design-proposals/resource-management/OWNERS
index 60221854..d717eba7 100644
--- a/contributors/design-proposals/resource-management/OWNERS
+++ b/contributors/design-proposals/resource-management/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - wg-resource-management-leads
 approvers:
diff --git a/contributors/design-proposals/scalability/OWNERS b/contributors/design-proposals/scalability/OWNERS
index 2b68b875..6b57aa45 100644
--- a/contributors/design-proposals/scalability/OWNERS
+++ b/contributors/design-proposals/scalability/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-scalability-leads
 approvers:
diff --git a/contributors/design-proposals/scheduling/OWNERS b/contributors/design-proposals/scheduling/OWNERS
index b3248766..f6155ab6 100644
--- a/contributors/design-proposals/scheduling/OWNERS
+++ b/contributors/design-proposals/scheduling/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-scheduling-leads
 approvers:
diff --git a/contributors/design-proposals/scheduling/images/OWNERS b/contributors/design-proposals/scheduling/images/OWNERS
index fe173c27..14c05899 100644
--- a/contributors/design-proposals/scheduling/images/OWNERS
+++ b/contributors/design-proposals/scheduling/images/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - bsalamat
 - michelleN
diff --git a/contributors/design-proposals/scheduling/scheduler_extender.md b/contributors/design-proposals/scheduling/scheduler_extender.md
index de7a6259..bc65f9ba 100644
--- a/contributors/design-proposals/scheduling/scheduler_extender.md
+++ b/contributors/design-proposals/scheduling/scheduler_extender.md
@@ -2,7 +2,7 @@
 There are three ways to add new scheduling rules (predicates and priority
 functions) to Kubernetes: (1) by adding these rules to the scheduler and
-recompiling, [described here](/contributors/devel/scheduler.md),
+recompiling, [described here](/contributors/devel/sig-scheduling/scheduler.md),
 (2) implementing your own scheduler process that runs instead of, or alongside
 of, the standard Kubernetes scheduler, (3) implementing a "scheduler extender"
 process that the standard Kubernetes scheduler calls out to as a final pass when
diff --git a/contributors/design-proposals/service-catalog/OWNERS b/contributors/design-proposals/service-catalog/OWNERS
index 5c6b18ed..a4884d4d 100644
--- a/contributors/design-proposals/service-catalog/OWNERS
+++ b/contributors/design-proposals/service-catalog/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-service-catalog-leads
 approvers:
diff --git a/contributors/design-proposals/storage/OWNERS b/contributors/design-proposals/storage/OWNERS
index fb58418f..6dd5158f 100644
--- a/contributors/design-proposals/storage/OWNERS
+++ b/contributors/design-proposals/storage/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-storage-leads
 approvers:
diff --git a/contributors/design-proposals/storage/container-storage-interface-pod-information.md b/contributors/design-proposals/storage/container-storage-interface-pod-information.md
new file mode 100644
index 00000000..872f9d45
--- /dev/null
+++ b/contributors/design-proposals/storage/container-storage-interface-pod-information.md
@@ -0,0 +1,48 @@
+# Pod in CSI NodePublish request
+Author: @jsafrane
+
+## Goal
+* Pass Pod information (pod name/namespace/UID + service account) to CSI drivers in `NodePublish` request as CSI volume attributes.
+
+## Motivation
+We'd like to move away from exec based Flex to gRPC based CSI volumes. In Flex, kubelet always passes `pod.namespace`, `pod.name`, `pod.uid` and `pod.spec.serviceAccountName` ("pod information") in every `mount` call. In the Kubernetes community we've seen some Flex drivers that use pod or service account information to authorize or audit usage of a volume, or to generate content of the volume tailored to the pod (e.g. https://github.com/Azure/kubernetes-keyvault-flexvol).
+
+CSI is agnostic to container orchestrators (such as Kubernetes, Mesos or CloudFoundry) and as such does not understand the concept of pods and service accounts. An [enhancement of the CSI protocol](https://github.com/container-storage-interface/spec/pull/252) to pass "workload" (~pod) information from Kubernetes to CSI drivers has met some resistance.
+
+## High-level design
+We decided to pass the pod information as `NodePublishVolumeRequest.volume_attributes`.
+
+* Kubernetes passes pod information only to CSI drivers that explicitly require that information in their [`CSIDriver` instance](https://github.com/kubernetes/community/pull/2523). These drivers are tightly coupled to Kubernetes and may not work or may require reconfiguration on other cloud orchestrators. It is expected (but not limited to) that these drivers will provide ephemeral volumes similar to Secrets or ConfigMap, extending Kubernetes secret or configuration sources.
+* Kubernetes will not pass pod information to CSI drivers that don't know or don't care about pods and service accounts. It is expected (but not limited to) that these drivers will provide real persistent storage. Such a CSI driver would reject a CSI call with pod information as invalid. This is the current behavior of Kubernetes and it will remain the default behavior.
+
+## Detailed design
+
+### API changes
+No API changes.
+
+### CSI enhancement
+We don't need to change the CSI protocol in any way. It allows kubelet to pass `pod.name`, `pod.uid` and `pod.spec.serviceAccountName` in the [`NodePublish` call as `volume_attributes`](https://github.com/container-storage-interface/spec/blob/master/spec.md#nodepublishvolume). `NodePublish` is roughly equivalent to the Flex `mount` call.
+
+The only thing we need to do is to **define** the names of the `volume_attributes` keys that CSI drivers can expect:
+ * `csi.storage.k8s.io/pod.name`: name of the pod that wants the volume.
+ * `csi.storage.k8s.io/pod.namespace`: namespace of the pod that wants the volume.
+ * `csi.storage.k8s.io/pod.uid`: uid of the pod that wants the volume.
+ * `csi.storage.k8s.io/serviceAccount.name`: name of the service account under which the pod operates. The namespace of the service account is the same as `pod.namespace`.
+
+Note that these attribute names are very similar to the [parameters we pass to the flex volume plugin](https://github.com/kubernetes/kubernetes/blob/10688257e63e4d778c499ba30cddbc8c6219abe9/pkg/volume/flexvolume/driver-call.go#L55).
+
+### Kubelet
+Kubelet needs to create an informer to cache `CSIDriver` instances. It passes the informer to the CSI volume plugin as a new argument of [`ProbeVolumePlugins`](https://github.com/kubernetes/kubernetes/blob/43f805b7bdda7a5b491d34611f85c249a63d7f97/pkg/volume/csi/csi_plugin.go#L58).
+
+### CSI volume plugin
+In `SetUpAt()`, the CSI volume plugin checks the `CSIDriver` informer to see whether a `CSIDriver` instance exists for the particular CSI driver that handles the volume. If the instance exists and has `PodInfoRequiredOnMount` set, the volume plugin adds the `csi.storage.k8s.io/*` attributes to `volume_attributes` of the CSI volume. It blindly overwrites any existing values there.
+
+Kubelet and the volume plugin must tolerate the case when the CRD for `CSIDriver` has not been created (yet). Kubelet and the CSI volume plugin fall back to the original behavior, i.e. they do not pass any pod information to CSI. We expect that CSI drivers will return a reasonable error code instead of mounting a wrong volume.
+
+TODO(jsafrane): check what a (shared?) informer does when it's created for a non-existing CRD. Will it start working automatically when the CRD is created? Or shall we retry creation of the informer every X seconds until the CRD is created? Alternatively, we may GET a fresh `CSIDriver` from the API server in `SetUpAt()`, without any informer.
+
+## Implementation
+
+* Alpha in 1.12 (behind `CSIPodInfo` feature gate)
+* Beta in 1.13 (behind `CSIPodInfo` feature gate)
+* GA 1.14?
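To make the attribute contract above concrete, here is a minimal, hypothetical Go sketch of the merging step the proposal describes. The `podInfo` type and `addPodInfo` function are illustrative names only (not the actual kubelet code), but the `csi.storage.k8s.io/*` keys are the ones the proposal defines, and the overwrite behavior matches the design.

```go
package main

import "fmt"

// podInfo carries the fields kubelet would forward to a CSI driver.
// Illustrative sketch only, not the real kubelet implementation.
type podInfo struct {
	Name, Namespace, UID, ServiceAccount string
}

// addPodInfo merges the pod attributes into the NodePublish
// volume_attributes map, blindly overwriting any existing values,
// as the proposal specifies for drivers with PodInfoRequiredOnMount.
func addPodInfo(attrs map[string]string, p podInfo) map[string]string {
	if attrs == nil {
		attrs = map[string]string{}
	}
	attrs["csi.storage.k8s.io/pod.name"] = p.Name
	attrs["csi.storage.k8s.io/pod.namespace"] = p.Namespace
	attrs["csi.storage.k8s.io/pod.uid"] = p.UID
	attrs["csi.storage.k8s.io/serviceAccount.name"] = p.ServiceAccount
	return attrs
}

func main() {
	attrs := addPodInfo(map[string]string{"share": "data"}, podInfo{
		Name: "web-0", Namespace: "default", UID: "1234", ServiceAccount: "default",
	})
	fmt.Println(attrs)
}
```

A driver that does not declare `PodInfoRequiredOnMount` would simply never see these keys, which is the backward-compatible default described above.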
diff --git a/contributors/design-proposals/storage/container-storage-interface.md b/contributors/design-proposals/storage/container-storage-interface.md
index a331fa0b..e368b4ac 100644
--- a/contributors/design-proposals/storage/container-storage-interface.md
+++ b/contributors/design-proposals/storage/container-storage-interface.md
@@ -29,7 +29,7 @@ Kubernetes volume plugins are currently “in-tree” meaning they are linked, c
 4. Volume plugins get full privileges of kubernetes components (kubelet and kube-controller-manager).
 5. Plugin developers are forced to make plugin source code available, and can not choose to release just a binary.
 
-The existing [Flex Volume](/contributors/devel/flexvolume.md) plugin attempted to address this by exposing an exec based API for mount/unmount/attach/detach. Although it enables third party storage vendors to write drivers out-of-tree, it requires access to the root filesystem of node and master machines in order to deploy the third party driver files.
+The existing [Flex Volume] plugin attempted to address this by exposing an exec based API for mount/unmount/attach/detach. Although it enables third party storage vendors to write drivers out-of-tree, it requires access to the root filesystem of node and master machines in order to deploy the third party driver files.
 
 Additionally, it doesn’t address another pain of in-tree volumes plugins: dependencies. Volume plugins tend to have many external requirements: dependencies on mount and filesystem tools, for example. These dependencies are assumed to be available on the underlying host OS, which often is not the case, and installing them requires direct machine access. There are efforts underway, for example https://github.com/kubernetes/community/pull/589, that are hoping to address this for in-tree volume plugins. But, enabling volume plugins to be completely containerized will make dependency management much easier.
@@ -56,7 +56,7 @@ The objective of this document is to document all the requirements for enabling
 * Recommend deployment process for Kubernetes compatible, third-party CSI Volume drivers on a Kubernetes cluster.
 
 ## Non-Goals
-* Replace [Flex Volume plugin](/contributors/devel/flexvolume.md)
+* Replace [Flex Volume plugin]
 * The Flex volume plugin exists as an exec based mechanism to create “out-of-tree” volume plugins.
 * Because Flex drivers exist and depend on the Flex interface, it will continue to be supported with a stable API.
 * The CSI Volume plugin will co-exist with Flex volume plugin.
@@ -777,3 +777,7 @@ Instead of creating a new `VolumeAttachment` object, another option we considere
 * List of nodes the volume was successfully attached to.
 
 We dismissed this approach because having attach/detach triggered by the creation/deletion of an object is much easier to manage (for both external-attacher and Kubernetes) and more robust (fewer corner cases to worry about).
+
+
+[Flex Volume]: /contributors/devel/sig-storage/flexvolume.md
+[Flex Volume plugin]: /contributors/devel/sig-storage/flexvolume.md
\ No newline at end of file
diff --git a/contributors/design-proposals/storage/flexvolume-deployment.md b/contributors/design-proposals/storage/flexvolume-deployment.md
index 0b40748b..19b7ea63 100644
--- a/contributors/design-proposals/storage/flexvolume-deployment.md
+++ b/contributors/design-proposals/storage/flexvolume-deployment.md
@@ -10,7 +10,7 @@ Beginning in version 1.8, the Kubernetes Storage SIG is putting a stop to accept
 
 [CSI](https://github.com/container-storage-interface/spec/blob/master/spec.md) provides a single interface that storage vendors can implement in order for their storage solutions to work across many different container orchestrators, and volume plugins are out-of-tree by design. This is a large effort, the full implementation of CSI is several quarters away, and there is a need for an immediate solution for storage vendors to continue adding volume plugins.
 
-[Flexvolume](/contributors/devel/flexvolume.md) is an in-tree plugin that has the ability to run any storage solution by executing volume commands against a user-provided driver on the Kubernetes host, and this currently exists today. However, the process of setting up Flexvolume is very manual, pushing it out of consideration for many users. Problems include having to copy the driver to a specific location in each node, manually restarting kubelet, and user's limited access to machines.
+[Flexvolume] is an in-tree plugin that has the ability to run any storage solution by executing volume commands against a user-provided driver on the Kubernetes host, and this currently exists today. However, the process of setting up Flexvolume is very manual, pushing it out of consideration for many users. Problems include having to copy the driver to a specific location in each node, manually restarting kubelet, and user's limited access to machines.
 
 An automated deployment technique is discussed in [Recommended Driver Deployment Method](#recommended-driver-deployment-method). The crucial change required to enable this method is allowing kubelet and controller manager to dynamically discover plugin changes.
@@ -164,3 +164,5 @@ Cons: Does not guarantee every node has a pod running. Pod anti-affinity can be
 * How does this system work with containerized kubelet?
 * Are there any SELinux implications?
+
+[Flexvolume]: /contributors/devel/sig-storage/flexvolume.md
\ No newline at end of file
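The "dynamically discover plugin changes" requirement mentioned in that diff can be pictured with a small filesystem-watcher sketch. The example below is only an illustration under stated assumptions, not the mechanism the proposal itself specifies: it uses the third-party github.com/fsnotify/fsnotify package, and the plugin directory shown is the conventional Flexvolume location (kubelet's actual search path is set with `--volume-plugin-dir`).

```go
package main

import (
	"log"

	"github.com/fsnotify/fsnotify"
)

func main() {
	// Assumed Flexvolume driver directory; treat as a placeholder.
	const pluginDir = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec"

	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatalf("creating watcher: %v", err)
	}
	defer watcher.Close()

	if err := watcher.Add(pluginDir); err != nil {
		log.Fatalf("watching %s: %v", pluginDir, err)
	}

	// React to drivers being copied in or removed at runtime instead of
	// requiring a kubelet restart; here we only log each change.
	for {
		select {
		case event := <-watcher.Events:
			log.Printf("plugin dir changed: %s (%s)", event.Name, event.Op)
		case err := <-watcher.Errors:
			log.Printf("watch error: %v", err)
		}
	}
}
```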
diff --git a/contributors/design-proposals/testing/OWNERS b/contributors/design-proposals/testing/OWNERS
index 48c9f03c..541bac08 100644
--- a/contributors/design-proposals/testing/OWNERS
+++ b/contributors/design-proposals/testing/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - sig-testing-leads
 approvers:
diff --git a/contributors/devel/OWNERS b/contributors/devel/OWNERS
index c4d35842..4b7cccf3 100644
--- a/contributors/devel/OWNERS
+++ b/contributors/devel/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
 - calebamiles
 - cblecker
@@ -5,6 +7,7 @@ reviewers:
 - idvoretskyi
 - Phillels
 - spiffxp
+- guineveresaenger
 approvers:
 - calebamiles
 - cblecker
diff --git a/contributors/devel/README.md b/contributors/devel/README.md
index 16d5fcda..31c0bcac 100644
--- a/contributors/devel/README.md
+++ b/contributors/devel/README.md
@@ -15,7 +15,7 @@ Guide](http://kubernetes.io/docs/admin/).
 * **Pull Request Process** ([/contributors/guide/pull-requests.md](/contributors/guide/pull-requests.md)):
   When and why pull requests are closed.
 
-* **Getting Recent Builds** ([getting-builds.md](getting-builds.md)): How to get recent builds including the latest builds that pass CI.
+* **Getting Recent Builds** ([getting-builds.md](sig-release/getting-builds.md)): How to get recent builds including the latest builds that pass CI.
 
 * **Automated Tools** ([automation.md](automation.md)): Descriptions of the automation that is running on our github repository.
 
@@ -24,20 +24,20 @@ Guide](http://kubernetes.io/docs/admin/).
 * **Development Guide** ([development.md](development.md)): Setting up your development environment.
 
-* **Testing** ([testing.md](testing.md)): How to run unit, integration, and end-to-end tests in your development sandbox.
+* **Testing** ([testing.md](sig-testing/testing.md)): How to run unit, integration, and end-to-end tests in your development sandbox.
 
 * **Conformance Testing** ([conformance-tests.md](sig-architecture/conformance-tests.md))
   What is conformance testing and how to create/manage them.
 
-* **Hunting flaky tests** ([flaky-tests.md](flaky-tests.md)): We have a goal of 99.9% flake free tests.
+* **Hunting flaky tests** ([flaky-tests.md](sig-testing/flaky-tests.md)): We have a goal of 99.9% flake free tests.
   Here's how to run your tests many times.
 
-* **Logging Conventions** ([logging.md](logging.md)): Glog levels.
+* **Logging Conventions** ([logging.md](sig-instrumentation/logging.md)): Glog levels.
 
-* **Profiling Kubernetes** ([profiling.md](profiling.md)): How to plug in go pprof profiler to Kubernetes.
+* **Profiling Kubernetes** ([profiling.md](sig-scalability/profiling.md)): How to plug in go pprof profiler to Kubernetes.
 
 * **Instrumenting Kubernetes with a new metric**
-  ([instrumentation.md](instrumentation.md)): How to add a new metrics to the
+  ([instrumentation.md](sig-instrumentation/instrumentation.md)): How to add a new metrics to the
   Kubernetes code base.
 
 * **Coding Conventions** ([coding-conventions.md](../guide/coding-conventions.md)):
diff --git a/contributors/devel/api_changes.md b/contributors/devel/api_changes.md
index 3d6bbf63..cab9130f 100644
--- a/contributors/devel/api_changes.md
+++ b/contributors/devel/api_changes.md
@@ -1,3 +1,3 @@
 This file has moved to https://git.k8s.io/community/contributors/devel/sig-architecture/api_changes.md.
 
-This file is a placeholder to preserve links. Please remove by April 24, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
+This file is a placeholder to preserve links. Please remove by April 24, 2019 or the release of kubernetes 1.13, whichever comes first.
diff --git a/contributors/devel/bazel.md b/contributors/devel/bazel.md
index 991a0ac2..502d32ee 100644
--- a/contributors/devel/bazel.md
+++ b/contributors/devel/bazel.md
@@ -1,184 +1,3 @@
-# Build and test with Bazel
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-testing/bazel.md.
-
-Building and testing Kubernetes with Bazel is supported but not yet default.
-
-Bazel is used to run all Kubernetes PRs on [Prow](https://prow.k8s.io),
-as remote caching enables significantly reduced build and test times.
-
-Some repositories (such as kubernetes/test-infra) have switched to using Bazel
-exclusively for all build, test, and release workflows.
-
-Go rules are managed by the [`gazelle`](https://github.com/bazelbuild/rules_go/tree/master/go/tools/gazelle)
-tool, with some additional rules managed by the [`kazel`](https://git.k8s.io/repo-infra/kazel) tool.
-These tools are called via the `hack/update-bazel.sh` script.
-
-Instructions for installing Bazel
-can be found [here](https://www.bazel.io/versions/master/docs/install.html).
-
-Several convenience `make` rules have been created for common operations:
-
-* `make bazel-build`: builds all binaries in tree (`bazel build -- //...
-  -//vendor/...`)
-* `make bazel-test`: runs all unit tests (`bazel test --config=unit -- //... //hack:verify-all -//build/... -//vendor/...`)
-* `make bazel-test-integration`: runs all integration tests (`bazel test
-  --config integration //test/integration/...`)
-* `make bazel-release`: builds release tarballs, Docker images (for server
-  components), and Debian images (`bazel build //build/release-tars`)
-
-You can also interact with Bazel directly; for example, to run all `kubectl` unit
-tests, run
-
-```console
-$ bazel test //pkg/kubectl/...
-```
-
-## Planter
-If you don't want to install Bazel, you can instead try using the unofficial
-[Planter](https://git.k8s.io/test-infra/planter) tool,
-which runs Bazel inside a Docker container.
-
-For example, you can run
-```console
-$ ../test-infra/planter/planter.sh make bazel-test
-$ ../test-infra/planter/planter.sh bazel build //cmd/kubectl
-```
-
-## Continuous Integration
-
-There are several bazel CI jobs:
-* [ci-kubernetes-bazel-build](http://k8s-testgrid.appspot.com/google-unit#bazel-build): builds everything
-  with Bazel
-* [ci-kubernetes-bazel-test](http://k8s-testgrid.appspot.com/google-unit#bazel-test): runs unit tests in
-  with Bazel
-
-Similar jobs are run on all PRs; additionally, several of the e2e jobs use
-Bazel-built binaries when launching and testing Kubernetes clusters.
-
-## Updating `BUILD` files
-
-To update `BUILD` files, run:
-
-```console
-$ ./hack/update-bazel.sh
-```
-
-To prevent Go rules from being updated, consult the [gazelle
-documentation](https://github.com/bazelbuild/rules_go/tree/master/go/tools/gazelle).
-
-Note that much like Go files and `gofmt`, `BUILD` files have standardized,
-opinionated style rules, and running `hack/update-bazel.sh` will format them for you.
-
-If you want to auto-format `BUILD` files in your editor, use of
-[Buildifier](https://github.com/bazelbuild/buildtools/blob/master/buildifier/README.md)
-is recommended.
-
-Updating the `BUILD` file for a package will be required when:
-* Files are added to or removed from a package
-* Import dependencies change for a package
-* A `BUILD` file has been updated and needs to be reformatted
-* A new `BUILD` file has been added (parent `BUILD` files will be updated)
-
-## Known issues and limitations
-
-### [Cross-compilation of cgo is not currently natively supported](https://github.com/bazelbuild/rules_go/issues/1020)
-All binaries are currently built for the host OS and architecture running Bazel.
-(For example, you can't currently target linux/amd64 from macOS or linux/s390x
-from an amd64 machine.)
-
-The Go rules support cross-compilation of pure Go code using the `--platforms`
-flag, and this is being used successfully in the kubernetes/test-infra repo.
-
-It may already be possible to cross-compile cgo code if a custom CC toolchain is
-set up, possibly reusing the kube-cross Docker image, but this area needs
-further exploration.
-
-### The CC toolchain is not fully hermetic
-Bazel requires several tools and development packages to be installed in the system, including `gcc`, `g++`, `glibc and libstdc++ development headers` and `glibc static development libraries`. Please check your distribution for exact names of the packages. Examples for some commonly used distributions are below:
-
-| Dependency | Debian/Ubuntu | CentOS | OpenSuSE |
-|:---------------------:|-------------------------------|--------------------------------|-----------------------------------------|
-| Build essentials | `apt install build-essential` | `yum groupinstall development` | `zypper install -t pattern devel_C_C++` |
-| GCC C++ | `apt install g++` | `yum install gcc-c++` | `zypper install gcc-c++` |
-| GNU Libc static files | `apt install libc6-dev` | `yum install glibc-static` | `zypper install glibc-devel-static` |
-
-If any of these packages change, they may also cause spurious build failures
-as described in [this issue](https://github.com/bazelbuild/bazel/issues/4907).
-
-An example error might look something like
-```
-ERROR: undeclared inclusion(s) in rule '//vendor/golang.org/x/text/cases:go_default_library.cgo_c_lib':
-this rule is missing dependency declarations for the following files included by 'vendor/golang.org/x/text/cases/linux_amd64_stripped/go_default_library.cgo_codegen~/_cgo_export.c':
-  '/usr/lib/gcc/x86_64-linux-gnu/7/include/stddef.h'
-```
-
-The only way to recover from this error is to force Bazel to regenerate its
-automatically-generated CC toolchain configuration by running `bazel clean
---expunge`.
-
-Improving cgo cross-compilation may help with all of this.
-
-### Changes to Go imports requires updating BUILD files
-The Go rules in `BUILD` and `BUILD.bazel` files must be updated any time files
-are added or removed or Go imports are changed. These rules are automatically
-maintained by `gazelle`, which is run via `hack/update-bazel.sh`, but this is
-still a source of friction.
-
-[Autogazelle](https://github.com/bazelbuild/bazel-gazelle/tree/master/cmd/autogazelle)
-is a new experimental tool which may reduce or remove the need for developers
-to run `hack/update-bazel.sh`, but no work has yet been done to support it in
-kubernetes/kubernetes.
-
-### Code coverage support is incomplete for Go
-Bazel and the Go rules have limited support for code coverage. Running something
-like `bazel coverage -- //... -//vendor/...` will run tests in coverage mode,
-but no report summary is currently generated. It may be possible to combine
-`bazel coverage` with
-[Gopherage](https://github.com/kubernetes/test-infra/tree/master/gopherage),
-however.
-
-### Kubernetes code generators are not fully supported
-The make-based build system in kubernetes/kubernetes runs several code
-generators at build time:
-* [conversion-gen](https://github.com/kubernetes/code-generator/tree/master/cmd/conversion-gen)
-* [deepcopy-gen](https://github.com/kubernetes/code-generator/tree/master/cmd/deepcopy-gen)
-* [defaulter-gen](https://github.com/kubernetes/code-generator/tree/master/cmd/defaulter-gen)
-* [openapi-gen](https://github.com/kubernetes/kube-openapi/tree/master/cmd/openapi-gen)
-* [go-bindata](https://github.com/jteeuwen/go-bindata/tree/master/go-bindata)
-
-Of these, only `openapi-gen` and `go-bindata` are currently supported when
-building Kubernetes with Bazel.
-
-The `go-bindata` generated code is produced by hand-written genrules.
-
-The other code generators use special build tags of the form `//
-+k8s:generator-name=arg`; for example, input files to the openapi-gen tool are
-specified with `// +k8s:openapi-gen=true`.
-
-`kazel` is used to find all packages that require OpenAPI generation, and then a
-handwritten genrule consumes this list of packages to run `openapi-gen`.
-
-For `openapi-gen`, a single output file is produced in a single Go package, which
-makes this fairly compatible with Bazel.
-All other Kubernetes code generators generally produce one output file per input
-package, which is less compatible with the Bazel workflow.
-
-The make-based build system batches up all input packages into one call to the
-code generator binary, but this is inefficient for Bazel's incrementality, as a
-change in one package may result in unnecessarily recompiling many other
-packages.
-On the other hand, calling the code generator binary multiple times is less
-efficient than calling it once, since many of the generators parse the tree for
-Go type information and other metadata.
-
-One additional challenge is that many of the code generators add additional
-Go imports which `gazelle` (and `autogazelle`) cannot infer, and so they must be
-explicitly added as dependencies in the `BUILD` files.
-
-Kubernetes has even more code generators than this limited list, but the rest
-are generally run as `hack/update-*.sh` scripts and checked into the repository,
-and so are not immediately needed for Bazel parity.
-
-## Contacts
-For help or discussion, join the [#bazel](https://kubernetes.slack.com/messages/bazel)
-channel on Kubernetes Slack.
+This file is a placeholder to preserve links. Please remove by April 30, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/cherry-picks.md b/contributors/devel/cherry-picks.md
index 7769f970..f7284c73 100644
--- a/contributors/devel/cherry-picks.md
+++ b/contributors/devel/cherry-picks.md
@@ -1,73 +1,3 @@
-# Overview
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-release/cherry-picks.md.
-
-This document explains how cherry-picks are managed on release branches within
-the kubernetes/kubernetes repository.
-A common use case for this task is backporting PRs from master to release
-branches.
-
-## Prerequisites
- * [Contributor License Agreement](http://git.k8s.io/community/CLA.md) is
-   considered implicit for all code within cherry-pick pull requests,
-   **unless there is a large conflict**.
- * A pull request merged against the master branch.
- * [Release branch](https://git.k8s.io/release/docs/branching.md) exists.
- * The normal git and GitHub configured shell environment for pushing to your
-   kubernetes `origin` fork on GitHub and making a pull request against a
-   configured remote `upstream` that tracks
-   "https://github.com/kubernetes/kubernetes.git", including `GITHUB_USER`.
- * Have `hub` installed, which is most easily installed via `go get
-   github.com/github/hub` assuming you have a standard golang development
-   environment.
-
-## Initiate a Cherry-pick
- * Run the [cherry-pick
-   script](https://git.k8s.io/kubernetes/hack/cherry_pick_pull.sh).
-   This example applies a master branch PR #98765 to the remote branch
-   `upstream/release-3.14`: `hack/cherry_pick_pull.sh upstream/release-3.14
-   98765`
- * Be aware the cherry-pick script assumes you have a git remote called
-   `upstream` that points at the Kubernetes github org.
-   Please see our [recommended Git workflow](https://git.k8s.io/community/contributors/guide/github-workflow.md#workflow).
- * You will need to run the cherry-pick script separately for each patch release you want to cherry-pick to.
-
- * Your cherry-pick PR will immediately get the `do-not-merge/cherry-pick-not-approved` label.
-   The [Branch Manager](https://git.k8s.io/sig-release/release-team/role-handbooks/branch-manager)
-   will triage PRs targeted to the next .0 minor release branch up until the
-   release, while the [Patch Release Team](https://git.k8s.io/sig-release/release-team/role-handbooks/patch-release-manager)
-   will handle all cherry-picks to patch releases.
-   Normal rules apply for code merge.
-   * Reviewers `/lgtm` and owners `/approve` as they deem appropriate.
-   * Milestones on cherry-pick PRs should be the milestone for the target
-     release branch (for example, milestone 1.11 for a cherry-pick onto
-     release-1.11).
-   * You can find the current release team members in the
-     [appropriate release folder](https://git.k8s.io/sig-release/releases) for the target release.
-     You may cc them with `<@githubusername>` on your cherry-pick PR.
-
-## Cherry-pick Review
-
-Cherry-pick pull requests have an additional requirement compared to normal pull
-requests.
-They must be approved specifically for cherry-pick by Approvers.
-The [Branch Manager](https://git.k8s.io/sig-release/release-team/role-handbooks/branch-manager)
-or the [Patch Release Team](https://git.k8s.io/sig-release/release-team/role-handbooks/patch-release-manager)
-are the final authority on removing the `do-not-merge/cherry-pick-not-approved`
-label and triggering a merge into the target branch.
-
-## Searching for Cherry-picks
-
-- [A sample search on kubernetes/kubernetes pull requests that are labeled as `cherry-pick-approved`](https://github.com/kubernetes/kubernetes/pulls?q=is%3Aopen+is%3Apr+label%3Acherry-pick-approved)
-
-- [A sample search on kubernetes/kubernetes pull requests that are labeled as `do-not-merge/cherry-pick-not-approved`](https://github.com/kubernetes/kubernetes/pulls?q=is%3Aopen+is%3Apr+label%3Ado-not-merge%2Fcherry-pick-not-approved)
-
-
-## Troubleshooting Cherry-picks
-
-Contributors may encounter some of the following difficulties when initiating a cherry-pick.
-
-- A cherry-pick PR does not apply cleanly against an old release branch.
-In that case, you will need to manually fix conflicts.
-
-- The cherry-pick PR includes code that does not pass CI tests.
-In such a case you will have to fetch the auto-generated branch from your fork, amend the problematic commit and force push to the auto-generated branch.
-Alternatively, you can create a new PR, which is noisier.
+This file is a placeholder to preserve links. Please remove by April 29, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/development.md b/contributors/devel/development.md
index 91c409e7..8c305b14 100644
--- a/contributors/devel/development.md
+++ b/contributors/devel/development.md
@@ -186,7 +186,7 @@ To check out code to work on, please refer to [this guide](/contributors/guide/g
 [build/common.sh]: https://git.k8s.io/kubernetes/build/common.sh
 [e2e-image]: https://git.k8s.io/test-infra/jenkins/e2e-image
 [etcd-latest]: https://coreos.com/etcd/docs/latest
-[etcd-install]: testing.md#install-etcd-dependency
+[etcd-install]: sig-testing/testing.md#install-etcd-dependency
 <!-- https://github.com/coreos/etcd/releases -->
 [go-workspace]: https://golang.org/doc/code.html#Workspaces
 [issue]: https://github.com/kubernetes/kubernetes/issues
@@ -194,4 +194,4 @@ To check out code to work on, please refer to [this guide](/contributors/guide/g
 [kubernetes.io]: https://kubernetes.io
 [mercurial]: http://mercurial.selenic.com/wiki/Download
 [test-image]: https://git.k8s.io/test-infra/jenkins/test-image
-[Build with Bazel]: bazel.md
+[Build with Bazel]: sig-testing/bazel.md
diff --git a/contributors/devel/e2e-tests.md b/contributors/devel/e2e-tests.md
index ba562912..31d589f6 100644
--- a/contributors/devel/e2e-tests.md
+++ b/contributors/devel/e2e-tests.md
@@ -1,759 +1,3 @@
-# End-to-End Testing in Kubernetes
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-testing/e2e-tests.md.
-
-**Table of Contents**
-
-- [End-to-End Testing in Kubernetes](#end-to-end-testing-in-kubernetes)
-  - [Overview](#overview)
-  - [Building Kubernetes and Running the Tests](#building-kubernetes-and-running-the-tests)
-  - [Cleaning up](#cleaning-up)
-  - [Advanced testing](#advanced-testing)
-    - [Extracting a specific version of kubernetes](#extracting-a-specific-version-of-kubernetes)
-    - [Bringing up a cluster for testing](#bringing-up-a-cluster-for-testing)
-  - [Federation e2e tests](#federation-e2e-tests)
-    - [Configuring federation e2e tests](#configuring-federation-e2e-tests)
-    - [Image Push Repository](#image-push-repository)
-    - [Build](#build)
-    - [Deploy federation control plane](#deploy-federation-control-plane)
-    - [Run the Tests](#run-the-tests)
-    - [Teardown](#teardown)
-    - [Shortcuts for test developers](#shortcuts-for-test-developers)
-  - [Debugging clusters](#debugging-clusters)
-  - [Local clusters](#local-clusters)
-    - [Testing against local clusters](#testing-against-local-clusters)
-  - [Version-skewed and upgrade testing](#version-skewed-and-upgrade-testing)
-    - [Test jobs naming convention](#test-jobs-naming-convention)
-  - [Kinds of tests](#kinds-of-tests)
-    - [Viper configuration and hierarchichal test parameters.](#viper-configuration-and-hierarchichal-test-parameters)
-    - [Conformance tests](#conformance-tests)
-  - [Continuous Integration](#continuous-integration)
-    - [What is CI?](#what-is-ci)
-    - [What runs in CI?](#what-runs-in-ci)
-      - [Non-default tests](#non-default-tests)
-    - [The PR-builder](#the-pr-builder)
-    - [Adding a test to CI](#adding-a-test-to-ci)
-    - [Moving a test out of CI](#moving-a-test-out-of-ci)
-  - [Performance Evaluation](#performance-evaluation)
-  - [One More Thing](#one-more-thing)
-
-
-## Overview
-
-End-to-end (e2e) tests for Kubernetes provide a mechanism to test end-to-end
-behavior of the system, and is the last signal to ensure end user operations
-match developer specifications. Although unit and integration tests provide a
-good signal, in a distributed system like Kubernetes it is not uncommon that a
-minor change may pass all unit and integration tests, but cause unforeseen
-changes at the system level.
-
-The primary objectives of the e2e tests are to ensure a consistent and reliable
-behavior of the kubernetes code base, and to catch hard-to-test bugs before
-users do, when unit and integration tests are insufficient.
-
-The e2e tests in kubernetes are built atop of
-[Ginkgo](http://onsi.github.io/ginkgo/) and
-[Gomega](http://onsi.github.io/gomega/). There are a host of features that this
-Behavior-Driven Development (BDD) testing framework provides, and it is
-recommended that the developer read the documentation prior to diving into the
-tests.
-
-The purpose of *this* document is to serve as a primer for developers who are
-looking to execute or add tests using a local development environment.
-
-Before writing new tests or making substantive changes to existing tests, you
-should also read [Writing Good e2e Tests](writing-good-e2e-tests.md)
-
-## Building Kubernetes and Running the Tests
-
-There are a variety of ways to run e2e tests, but we aim to decrease the number
-of ways to run e2e tests to a canonical way: `kubetest`.
-
-You can install `kubetest` as follows:
-```sh
-go get -u k8s.io/test-infra/kubetest
-```
-
-You can run an end-to-end test which will bring up a master and nodes, perform
-some tests, and then tear everything down. Make sure you have followed the
-getting started steps for your chosen cloud platform (which might involve
-changing the --provider flag value to something other than "gce").
-
-You can quickly recompile the e2e testing framework via `go install ./test/e2e`.
-This will not do anything besides allow you to verify that the go code compiles.
-If you want to run your e2e testing framework without re-provisioning the e2e setup,
-you can do so via `make WHAT=test/e2e/e2e.test`, and then re-running the ginkgo tests.
-
-To build Kubernetes, up a cluster, run tests, and tear everything down, use:
-
-```sh
-kubetest --build --up --test --down
-```
-
-If you'd like to just perform one of these steps, here are some examples:
-
-```sh
-# Build binaries for testing
-kubetest --build
-
-# Create a fresh cluster. Deletes a cluster first, if it exists
-kubetest --up
-
-# Run all tests
-kubetest --test
-
-# Run tests matching the regex "\[Feature:Performance\]" against a local cluster
-# Specify "--provider=local" flag when running the tests locally
-kubetest --test --test_args="--ginkgo.focus=\[Feature:Performance\]" --provider=local
-
-# Conversely, exclude tests that match the regex "Pods.*env"
-kubetest --test --test_args="--ginkgo.skip=Pods.*env"
-
-# Run tests in parallel, skip any that must be run serially
-GINKGO_PARALLEL=y kubetest --test --test_args="--ginkgo.skip=\[Serial\]"
-
-# Run tests in parallel, skip any that must be run serially and keep the test namespace if test failed
-GINKGO_PARALLEL=y kubetest --test --test_args="--ginkgo.skip=\[Serial\] --delete-namespace-on-failure=false"
-
-# Flags can be combined, and their actions will take place in this order:
-# --build, --up, --test, --down
-#
-# You can also specify an alternative provider, such as 'aws'
-#
-# e.g.:
-kubetest --provider=aws --build --up --test --down
-
-# -ctl can be used to quickly call kubectl against your e2e cluster. Useful for
-# cleaning up after a failed test or viewing logs.
-# kubectl output is default on, you can use --verbose-commands=false to suppress output.
-kubetest -ctl='get events'
-kubetest -ctl='delete pod foobar'
-```
-
-The tests are built into a single binary which can be used to deploy a
-Kubernetes system or run tests against an already-deployed Kubernetes system.
-See `kubetest --help` (or the flag definitions in `hack/e2e.go`) for
-more options, such as reusing an existing cluster.
-
-### Cleaning up
-
-During a run, pressing `control-C` should result in an orderly shutdown, but if
-something goes wrong and you still have some VMs running you can force a cleanup
-with this command:
-
-```sh
-kubetest --down
-```
-
-## Advanced testing
-
-### Extracting a specific version of kubernetes
-
-The `kubetest` binary can download and extract a specific version of kubernetes,
-both the server, client and test binaries. The `--extract=E` flag enables this
-functionality.
-
-There are a variety of values to pass this flag:
-
-```sh
-# Official builds: <ci|release>/<latest|stable>[-N.N]
-kubetest --extract=ci/latest --up # Deploy the latest ci build.
-kubetest --extract=ci/latest-1.5 --up # Deploy the latest 1.5 CI build.
-kubetest --extract=release/latest --up # Deploy the latest RC.
-kubetest --extract=release/stable-1.5 --up # Deploy the 1.5 release.
-
-# A specific version:
-kubetest --extract=v1.5.1 --up # Deploy 1.5.1
-kubetest --extract=v1.5.2-beta.0 --up # Deploy 1.5.2-beta.0
-kubetest --extract=gs://foo/bar --up # --stage=gs://foo/bar
-
-# Whatever GKE is using (gke, gke-staging, gke-test):
-kubetest --extract=gke --up # Deploy whatever GKE prod uses
-
-# Using a GCI version:
-kubetest --extract=gci/gci-canary --up # Deploy the version for next gci release
-kubetest --extract=gci/gci-57 # Deploy the version bound to gci m57
-kubetest --extract=gci/gci-57/ci/latest # Deploy the latest CI build using gci m57 for the VM image
-
-# Reuse whatever is already built
-kubetest --up # Most common. Note, no extract flag
-kubetest --build --up # Most common. Note, no extract flag
-kubetest --build --stage=gs://foo/bar --extract=local --up # Extract the staged version
-```
-
-### Bringing up a cluster for testing
-
-If you want, you may bring up a cluster in some other manner and run tests
-against it. To do so, or to do other non-standard test things, you can pass
-arguments into Ginkgo using `--test_args` (e.g. see above). For the purposes of
-brevity, we will look at a subset of the options, which are listed below:
-
-```
---ginkgo.dryRun=false: If set, ginkgo will walk the test hierarchy without
-actually running anything.
-
---ginkgo.failFast=false: If set, ginkgo will stop running a test suite after a
-failure occurs.
-
---ginkgo.failOnPending=false: If set, ginkgo will mark the test suite as failed
-if any specs are pending.
-
---ginkgo.focus="": If set, ginkgo will only run specs that match this regular
-expression.
-
---ginkgo.noColor="n": If set to "y", ginkgo will not use color in the output
-
---ginkgo.skip="": If set, ginkgo will only run specs that do not match this
-regular expression.
-
---ginkgo.trace=false: If set, default reporter prints out the full stack trace
-when a failure occurs
-
---ginkgo.v=false: If set, default reporter print out all specs as they begin.
-
---host="": The host, or api-server, to connect to
-
---kubeconfig="": Path to kubeconfig containing embedded authinfo.
-
---provider="": The name of the Kubernetes provider (gce, gke, local, vagrant,
-etc.)
-
---repo-root="../../": Root directory of kubernetes repository, for finding test
-files.
-```
-
-Prior to running the tests, you may want to first create a simple auth file in
-your home directory, e.g. `$HOME/.kube/config`, with the following:
-
-```
-{
-  "User": "root",
-  "Password": ""
-}
-```
-
-As mentioned earlier there are a host of other options that are available, but
-they are left to the developer.
-
-**NOTE:** If you are running tests on a local cluster repeatedly, you may need
-to periodically perform some manual cleanup:
-
-  - `rm -rf /var/run/kubernetes`, clear kube generated credentials, sometimes
-stale permissions can cause problems.
-
-  - `sudo iptables -F`, clear ip tables rules left by the kube-proxy.
-
-### Reproducing failures in flaky tests
-You can run a test repeatedly until it fails. This is useful when debugging
-flaky tests. In order to do so, you need to set the following environment
-variable:
-```sh
-$ export GINKGO_UNTIL_IT_FAILS=true
-```
-
-After setting the environment variable, you can run the tests as before. The e2e
-script adds `--untilItFails=true` to ginkgo args if the environment variable is
-set. The flags asks ginkgo to run the test repeatedly until it fails.
-
-### Federation e2e tests
-
-By default, `e2e.go` provisions a single Kubernetes cluster, and any `Feature:Federation` ginkgo tests will be skipped.
-
-Federation e2e testing involve bringing up multiple "underlying" Kubernetes clusters,
-and deploying the federation control plane as a Kubernetes application on the underlying clusters.
-
-The federation e2e tests are still managed via `e2e.go`, but require some extra configuration items.
-
-#### Configuring federation e2e tests
-
-The following environment variables will enable federation e2e building, provisioning and testing.
-
-```sh
-$ export FEDERATION=true
-$ export E2E_ZONES="us-central1-a us-central1-b us-central1-f"
-```
-
-A Kubernetes cluster will be provisioned in each zone listed in `E2E_ZONES`. A zone can only appear once in the `E2E_ZONES` list.
-
-#### Image Push Repository
-
-Next, specify the docker repository where your ci images will be pushed.
-
-* **If `--provider=gce` or `--provider=gke`**:
-
-  If you use the same GCP project where you to run the e2e tests as the container image repository,
-  FEDERATION_PUSH_REPO_BASE environment variable will be defaulted to "gcr.io/${DEFAULT_GCP_PROJECT_NAME}".
-  You can skip ahead to the **Build** section.
-
-  You can simply set your push repo base based on your project name, and the necessary repositories will be
-  auto-created when you first push your container images.
-
-  ```sh
-  $ export FEDERATION_PUSH_REPO_BASE="gcr.io/${GCE_PROJECT_NAME}"
-  ```
-
-  Skip ahead to the **Build** section.
-
-* **For all other providers**:
-
-  You'll be responsible for creating and managing access to the repositories manually.
-
-  ```sh
-  $ export FEDERATION_PUSH_REPO_BASE="quay.io/colin_hom"
-  ```
-
-  Given this example, the `federation-apiserver` container image will be pushed to the repository
-  `quay.io/colin_hom/federation-apiserver`.
-
-  The docker client on the machine running `e2e.go` must have push access for the following pre-existing repositories:
-
-  * `${FEDERATION_PUSH_REPO_BASE}/federation-apiserver`
-  * `${FEDERATION_PUSH_REPO_BASE}/federation-controller-manager`
-
-  These repositories must allow public read access, as the e2e node docker daemons will not have any credentials. If you're using
-  GCE/GKE as your provider, the repositories will have read-access by default.
-
-#### Build
-
-* Compile the binaries and build container images:
-
-  ```sh
-  $ KUBE_RELEASE_RUN_TESTS=n KUBE_FASTBUILD=true kubetest -build
-  ```
-
-* Push the federation container images
-
-  ```sh
-  $ federation/develop/push-federation-images.sh
-  ```
-
-#### Deploy federation control plane
-
-The following command will create the underlying Kubernetes clusters in each of `E2E_ZONES`, and then provision the
-federation control plane in the cluster occupying the last zone in the `E2E_ZONES` list.
-
-```sh
-$ kubetest --up
-```
-
-#### Run the Tests
-
-This will run only the `Feature:Federation` e2e tests. You can omit the `ginkgo.focus` argument to run the entire e2e suite.
-
-```sh
-$ kubetest --test --test_args="--ginkgo.focus=\[Feature:Federation\]"
-```
-
-#### Teardown
-
-```sh
-$ kubetest --down
-```
-
-#### Shortcuts for test developers
-
-* To speed up `--up`, provision a single-node kubernetes cluster in a single e2e zone:
-
-  `NUM_NODES=1 E2E_ZONES="us-central1-f"`
-
-  Keep in mind that some tests may require multiple underlying clusters and/or minimum compute resource availability.
-
-* If you're hacking around with the federation control plane deployment itself,
-  you can quickly re-deploy the federation control plane Kubernetes manifests without tearing any resources down.
-  To re-deploy the federation control plane after running `--up` for the first time:
-
-  ```sh
-  $ federation/cluster/federation-up.sh
-  ```
-
-### Debugging clusters
-
-If a cluster fails to initialize, or you'd like to better understand cluster
-state to debug a failed e2e test, you can use the `cluster/log-dump.sh` script
-to gather logs.
-
-This script requires that the cluster provider supports ssh. Assuming it does,
-running:
-
-```sh
-$ federation/cluster/log-dump.sh <directory>
-```
-
-will ssh to the master and all nodes and download a variety of useful logs to
-the provided directory (which should already exist).
-
-The Google-run Jenkins builds automatically collected these logs for every
-build, saving them in the `artifacts` directory uploaded to GCS.
-
-### Local clusters
-
-It can be much faster to iterate on a local cluster instead of a cloud-based
-one. To start a local cluster, you can run:
-
-```sh
-# The PATH construction is needed because PATH is one of the special-cased
-# environment variables not passed by sudo -E
-sudo PATH=$PATH hack/local-up-cluster.sh
-```
-
-This will start a single-node Kubernetes cluster than runs pods using the local
-docker daemon. Press Control-C to stop the cluster.
-
-You can generate a valid kubeconfig file by following instructions printed at the
-end of aforementioned script.
-
-#### Testing against local clusters
-
-In order to run an E2E test against a locally running cluster, first make sure
-to have a local build of the tests:
-
-```sh
-kubetest --build
-```
-
-Then point the tests at a custom host directly:
-
-```sh
-export KUBECONFIG=/path/to/kubeconfig
-kubetest --provider=local --test
-```
-
-To control the tests that are run:
-
-```sh
-kubetest --provider=local --test --test_args="--ginkgo.focus=Secrets"
-```
-
-You will also likely need to specify `minStartupPods` to match the number of
-nodes in your cluster. If you're testing against a cluster set up by
-`local-up-cluster.sh`, you will need to do the following:
-
-```sh
-kubetest --provider=local --test --test_args="--minStartupPods=1 --ginkgo.focus=Secrets"
-```
-
-### Version-skewed and upgrade testing
-
-We run version-skewed tests to check that newer versions of Kubernetes work
-similarly enough to older versions. The general strategy is to cover the following cases:
-
-1. One version of `kubectl` with another version of the cluster and tests (e.g.
-   that v1.2 and v1.4 `kubectl` doesn't break v1.3 tests running against a v1.3
-   cluster).
-1. A newer version of the Kubernetes master with older nodes and tests (e.g.
-   that upgrading a master to v1.3 with nodes at v1.2 still passes v1.2 tests).
-1. A newer version of the whole cluster with older tests (e.g. that a cluster
-   upgraded---master and nodes---to v1.3 still passes v1.2 tests).
-1. That an upgraded cluster functions the same as a brand-new cluster of the
-   same version (e.g. a cluster upgraded to v1.3 passes the same v1.3 tests as
-   a newly-created v1.3 cluster).
-
-[kubetest](https://git.k8s.io/test-infra/kubetest) is
-the authoritative source on how to run version-skewed tests, but below is a
-quick-and-dirty tutorial.
-
-```sh
-# Assume you have two copies of the Kubernetes repository checked out, at
-# ./kubernetes and ./kubernetes_old
-
-# If using GKE:
-export CLUSTER_API_VERSION=${OLD_VERSION}
-
-# Deploy a cluster at the old version; see above for more details
-cd ./kubernetes_old
-kubetest --up
-
-# Upgrade the cluster to the new version
-#
-# If using GKE, add --upgrade-target=${NEW_VERSION}
-#
-# You can target Feature:MasterUpgrade or Feature:ClusterUpgrade
-cd ../kubernetes
-kubetest --provider=gke --test --check-version-skew=false --test_args="--ginkgo.focus=\[Feature:MasterUpgrade\]"
-
-# Run old tests with new kubectl
-cd ../kubernetes_old
-kubetest --provider=gke --test --test_args="--kubectl-path=$(pwd)/../kubernetes/cluster/kubectl.sh"
-```
-
-If you are just testing version-skew, you may want to just deploy at one
-version and then test at another version, instead of going through the whole
-upgrade process:
-
-```sh
-# With the same setup as above
-
-# Deploy a cluster at the new version
-cd ./kubernetes
-kubetest --up
-
-# Run new tests with old kubectl
-kubetest --test --test_args="--kubectl-path=$(pwd)/../kubernetes_old/cluster/kubectl.sh"
-
-# Run old tests with new kubectl
-cd ../kubernetes_old
-kubetest --test --test_args="--kubectl-path=$(pwd)/../kubernetes/cluster/kubectl.sh"
-```
-
-#### Test jobs naming convention
-
-**Version skew tests** are named as
-`<cloud-provider>-<master&node-version>-<kubectl-version>-<image-name>-kubectl-skew`
-e.g: `gke-1.5-1.6-cvm-kubectl-skew` means cloud provider is GKE;
-master and nodes are built from `release-1.5` branch;
-`kubectl` is built from `release-1.6` branch;
-image name is cvm (container_vm).
-The test suite is always the older one in version skew tests. e.g. from release-1.5 in this case.
-
-**Upgrade tests**:
-
-If a test job name ends with `upgrade-cluster`, it means we first upgrade
-the cluster (i.e. master and nodes) and then run the old test suite with new kubectl.
-
-If a test job name ends with `upgrade-cluster-new`, it means we first upgrade
-the cluster (i.e. master and nodes) and then run the new test suite with new kubectl.
-
-If a test job name ends with `upgrade-master`, it means we first upgrade
-the master and keep the nodes in old version and then run the old test suite with new kubectl.
-
-There are some examples in the table,
-where `->` means upgrading; container_vm (cvm) and gci are image names.
-
-| test name | test suite | master version (image) | node version (image) | kubectl
-| --------- | :--------: | :----: | :---:| :---:
-| gce-1.5-1.6-upgrade-cluster | 1.5 | 1.5->1.6 | 1.5->1.6 | 1.6
-| gce-1.5-1.6-upgrade-cluster-new | 1.6 | 1.5->1.6 | 1.5->1.6 | 1.6
-| gce-1.5-1.6-upgrade-master | 1.5 | 1.5->1.6 | 1.5 | 1.6
-| gke-container_vm-1.5-container_vm-1.6-upgrade-cluster | 1.5 | 1.5->1.6 (cvm) | 1.5->1.6 (cvm) | 1.6
-| gke-gci-1.5-container_vm-1.6-upgrade-cluster-new | 1.6 | 1.5->1.6 (gci) | 1.5->1.6 (cvm) | 1.6
-| gke-gci-1.5-container_vm-1.6-upgrade-master | 1.5 | 1.5->1.6 (gci) | 1.5 (cvm) | 1.6
-
-## Kinds of tests
-
-We are working on implementing clearer partitioning of our e2e tests to make
-running a known set of tests easier (#10548). Tests can be labeled with any of
-the following labels, in order of increasing precedence (that is, each label
-listed below supersedes the previous ones):
-
- - If a test has no labels, it is expected to run fast (under five minutes), be
-able to be run in parallel, and be consistent.
-
- - `[Slow]`: If a test takes more than five minutes to run (by itself or in
-parallel with many other tests), it is labeled `[Slow]`. This partition allows
-us to run almost all of our tests quickly in parallel, without waiting for the
-stragglers to finish.
-
- - `[Serial]`: If a test cannot be run in parallel with other tests (e.g. it
-takes too many resources or restarts nodes), it is labeled `[Serial]`, and
-should be run in serial as part of a separate suite.
-
- - `[Disruptive]`: If a test restarts components that might cause other tests
-to fail or break the cluster completely, it is labeled `[Disruptive]`. Any
-`[Disruptive]` test is also assumed to qualify for the `[Serial]` label, but
-need not be labeled as both. These tests are not run against soak clusters to
-avoid restarting components.
-
- - `[Flaky]`: If a test is found to be flaky and we have decided that it's too
-hard to fix in the short term (e.g. it's going to take a full engineer-week), it
-receives the `[Flaky]` label until it is fixed. The `[Flaky]` label should be
-used very sparingly, and should be accompanied by a reference to the issue for
-de-flaking the test, because while a test remains labeled `[Flaky]`, it is not
-monitored closely in CI. `[Flaky]` tests are by default not run, unless a
-`focus` or `skip` argument is explicitly given.
-
- - `[Feature:.+]`: If a test has non-default requirements to run or targets
-some non-core functionality, and thus should not be run as part of the standard
-suite, it receives a `[Feature:.+]` label, e.g. `[Feature:Performance]` or
-`[Feature:Ingress]`. `[Feature:.+]` tests are not run in our core suites,
-instead running in custom suites. If a feature is experimental or alpha and is
-not enabled by default due to being incomplete or potentially subject to
-breaking changes, it does *not* block PR merges, and thus should run in
-some separate test suites owned by the feature owner(s)
-(see [Continuous Integration](#continuous-integration) below).
-
- - `[Conformance]`: Designates that this test is included in the Conformance
-test suite for [Conformance Testing](sig-architecture/conformance-tests.md). This test must
-meet a number of [requirements](sig-architecture/conformance-tests.md#conformance-test-requirements)
-to be eligible for this tag. This tag does not supersede any other labels.
-
- - The following tags are not considered to be exhaustively applied, but are
-intended to further categorize existing `[Conformance]` tests, or tests that are
-being considered as candidates for promotion to `[Conformance]` as we work to
-refine requirements:
-    - `[Privileged]`: This is a test that requires privileged access
-    - `[Internet]`: This is a test that assumes access to the public internet
-    - `[Deprecated]`: This is a test that exercises a deprecated feature
-    - `[Alpha]`: This is a test that exercises an alpha feature
-    - `[Beta]`: This is a test that exercises a beta feature
-
-Every test should be owned by a [SIG](/sig-list.md),
-and have a corresponding `[sig-<name>]` label.
-
-### Viper configuration and hierarchical test parameters
-
-E2e test configuration will increasingly be defined using viper, and decreasingly via flags.
-
-Flags in general fall apart once tests become sufficiently complicated. So, even if we could use another flag library, it wouldn't be ideal.
-
-To use viper, rather than flags, to configure your tests:
-
-- Add an "e2e.json" file to the current directory you are in, and define parameters in it, e.g. `"kubeconfig":"/tmp/x"`.
-
-Note that advanced testing parameters, and hierarchically defined parameters, are only defined in viper. To see what they are, you can dive into [TestContextType](https://git.k8s.io/kubernetes/test/e2e/framework/test_context.go).
-
-In time, it is our intent to add or autogenerate a sample viper configuration that includes all e2e parameters, to ship with Kubernetes.
-
-### Conformance tests
-
-For more information on Conformance tests please see [Conformance Testing](sig-architecture/conformance-tests.md).
-
-## Continuous Integration
-
-A quick overview of how we run e2e CI on Kubernetes.
-
-### What is CI?
-
-We run a battery of [release-blocking jobs](https://k8s-testgrid.appspot.com/sig-release-master-blocking)
-against `HEAD` of the master branch on a continuous basis, and block merges
-via [Tide](https://git.k8s.io/test-infra/prow/cmd/tide) on a subset of those
-tests if they fail.
-
-CI results can be found at [ci-test.k8s.io](http://ci-test.k8s.io), e.g.
-[ci-test.k8s.io/kubernetes-e2e-gce/10594](http://ci-test.k8s.io/kubernetes-e2e-gce/10594).
-
-### What runs in CI?
-
-We run all default tests (those that aren't marked `[Flaky]` or `[Feature:.+]`)
-against GCE and GKE. To minimize the time from regression-to-green-run, we
-partition tests across different jobs:
-
- - `kubernetes-e2e-<provider>` runs all non-`[Slow]`, non-`[Serial]`,
-non-`[Disruptive]`, non-`[Flaky]`, non-`[Feature:.+]` tests in parallel.
-
- - `kubernetes-e2e-<provider>-slow` runs all `[Slow]`, non-`[Serial]`,
-non-`[Disruptive]`, non-`[Flaky]`, non-`[Feature:.+]` tests in parallel.
-
- - `kubernetes-e2e-<provider>-serial` runs all `[Serial]` and `[Disruptive]`,
-non-`[Flaky]`, non-`[Feature:.+]` tests in serial.
-
-We also run non-default tests if the tests exercise general-availability ("GA")
-features that require a special environment to run in, e.g.
-`kubernetes-e2e-gce-scalability` and `kubernetes-kubemark-gce`, which test for
-Kubernetes performance.
-
-#### Non-default tests
-
-We don't run many `[Feature:.+]` tests in CI. These tests are for features that
-are experimental (often in the `experimental` API), and aren't enabled by
-default.
-
-### The PR-builder
-
-We also run a battery of tests against every PR before we merge it.
These tests
-are equivalent to `kubernetes-gce`: they run all non-`[Slow]`, non-`[Serial]`,
-non-`[Disruptive]`, non-`[Flaky]`, non-`[Feature:.+]` tests in parallel. These
-tests are considered "smoke tests" to give a decent signal that the PR doesn't
-break most functionality. Results for your PR can be found at
-[pr-test.k8s.io](http://pr-test.k8s.io), e.g.
-[pr-test.k8s.io/20354](http://pr-test.k8s.io/20354) for #20354.
-
-### Adding a test to CI
-
-As mentioned above, prior to adding a new test, it is a good idea to perform a
-`-ginkgo.dryRun=true` on the system, in order to see if a behavior is already
-being tested, or to determine if it may be possible to augment an existing set
-of tests for a specific use case.
-
-If a behavior does not currently have coverage and a developer wishes to add a
-new e2e test, navigate to the ./test/e2e directory and create a new test using
-the existing suite as a guide.
-
-**NOTE:** To build/run with tests in a new directory within ./test/e2e, add the
-directory to the import list in ./test/e2e/e2e_test.go.
-
-TODO(#20357): Create a self-documented example which has been disabled, but can
-be copied to create new tests and outlines the capabilities and libraries used.
-
-When writing a test, consult [Kinds of tests](#kinds-of-tests) above to determine how your test
-should be marked (e.g. `[Slow]`, `[Serial]`; remember, by default we assume a
-test can run in parallel with other tests!).
-
-When first adding a test it should *not* go straight into CI, because failures
-block ordinary development. A test should only be added to CI after it has been
-running in some non-CI suite long enough to establish a track record showing
-that the test does not fail when run against *working* software. Note also that
-tests running in CI are generally running on a well-loaded cluster, so must
-contend for resources; see above about [kinds of tests](#kinds_of_tests).
-
-Generally, a feature starts as `experimental`, and will be run in some suite
-owned by the team developing the feature. If a feature is in beta or GA, it
-*should* block PR merges and releases. In moving from experimental to beta or GA, tests
-that are expected to pass by default should simply remove the `[Feature:.+]`
-label, and will be incorporated into our core suites. If tests are not expected
-to pass by default (e.g. they require a special environment such as added
-quota), they should remain with the `[Feature:.+]` label.
-
-Occasionally, we'll want to add tests to better exercise features that are
-already GA. These tests also shouldn't go straight to CI. They should begin by
-being marked as `[Flaky]` to be run outside of CI, and once a track record for
-them is established, they may be promoted out of `[Flaky]`.
-
-### Moving a test out of CI
-
-If we have determined that a test is known-flaky and cannot be fixed in the
-short term, we may move it out of CI indefinitely. This move should be used
-sparingly, as it effectively means that we have no coverage of that test. When a
-test is demoted, it should be marked `[Flaky]`, with an accompanying comment
-referencing an issue opened to fix the test.
-
-## Performance Evaluation
-
-Another benefit of the e2e tests is the ability to create reproducible loads on
-the system, which can then be used to determine responsiveness, or to analyze
-other characteristics of the system. For example, the density tests load the
-system to 30, 50, and 100 pods per node and measure characteristics of
-the system such as throughput, api-latency, etc.
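-
-One way to get such custom measurements out of a test run is to push them to the
-[prom-push-gateway](http://prometheus.io/docs/instrumenting/pushing/)
-recommended below. A minimal sketch, assuming a Pushgateway is already listening
-on localhost:9091 (the metric name and value here are purely illustrative):
-
-```sh
-# Push a hypothetical pod-startup latency observed by a density test run.
-# The Pushgateway groups this series under the job label "e2e_density".
-echo "e2e_pod_startup_latency_seconds 2.7" | \
-  curl --data-binary @- http://localhost:9091/metrics/job/e2e_density
-```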
-
-For a good overview of how we analyze performance data, please read the
-following [post](https://kubernetes.io/blog/2015/09/kubernetes-performance-measurements-and/).
-
-For developers who are interested in doing their own performance analysis, we
-recommend setting up [prometheus](http://prometheus.io/) for data collection,
-and using [grafana](https://prometheus.io/docs/visualization/grafana/) to
-visualize the data. You can also push your own metrics from the tests using a
-[prom-push-gateway](http://prometheus.io/docs/instrumenting/pushing/).
-Containers for all of these components can be found
-[here](https://hub.docker.com/u/prom/).
-
-For more accurate measurements, you may wish to set up prometheus external to
-kubernetes in an environment where it can access the major system components
-(api-server, controller-manager, scheduler). This is especially useful when
-attempting to gather metrics in a load-balanced api-server environment, because
-all api-servers can be analyzed independently as well as collectively. On
-startup, a configuration file is passed to prometheus that specifies the endpoints
-to scrape, as well as the sampling interval.
-
-```
-#prometheus.conf
-job: {
-  name: "kubernetes"
-  scrape_interval: "1s"
-  target_group: {
-    # apiserver(s)
-    target: "http://localhost:8080/metrics"
-    # scheduler
-    target: "http://localhost:10251/metrics"
-    # controller-manager
-    target: "http://localhost:10252/metrics"
-  }
-}
-```
-
-Once prometheus is scraping the kubernetes endpoints, that data can then be
-plotted using promdash, and alerts can be created against the assortment of
-metrics that kubernetes provides.
-
-## One More Thing
-
-You should also know the [testing conventions](../guide/coding-conventions.md#testing-conventions).
-
-**HAPPY TESTING!**
+This file is a placeholder to preserve links. Please remove by April 30, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/event-style-guide.md b/contributors/devel/event-style-guide.md
index bc4ba22b..52356d36 100644
--- a/contributors/devel/event-style-guide.md
+++ b/contributors/devel/event-style-guide.md
@@ -1,51 +1,3 @@
-# Event style guide
-
-Status: During Review
-
-Author: Marek Grabowski (gmarek@)
-
-## Why the guide?
-
-The Event API change proposal is the first step towards having useful Events in the system. Another step is to formalize the Event style guide, i.e. a set of properties that developers need to ensure when adding new Events to the system. This is necessary to ensure that we have a system in which all components emit consistently structured Events.
-
-## When to emit an Event?
-
-Events are expected to provide important insights for the application developer/operator on the state of their application. Events relevant to cluster administrators are acceptable as well, though they usually also have the option of looking at component logs. Events are much more expensive than logs, thus they're not expected to provide in-depth system debugging information. Instead, concentrate on things that are important from the application developer's perspective. Events need to be either actionable or useful for understanding past or future system behavior. Events are not intended to drive automation. Watching resource status should be sufficient for controllers.
-
-Following are the guidelines for adding Events to the system. Those are not hard-and-fast rules, but should be considered by all contributors adding new Events and members doing reviews.
-1. Emit Events only when the state of the system changes or attempts to change. Events like "it's still running" are not interesting. Also, changes that do not add information beyond what is observable by watching the altered resources should not be duplicated as Events. Note that adding a reason for some action that can't be inferred from the state change is considered additional information.
-1. Limit Events to no more than one per change/attempt. There's no need for Events on "About to do X" AND "Did X"/"Failed to do X". The result is more interesting and implies an attempt.
-    1. It may give the impression that this gets tricky with scale events, e.g. a Deployment scales a ReplicaSet, which creates/deletes Pods. For us those are 3 (or more) separate Events (3 different objects are affected), so it's fine to emit multiple Events.
-1. When an error occurs that prevents a user application from starting or from enacting other normal system behavior, such as object creation, an Event should be emitted (e.g. invalid image).
-    1. Note that Events are garbage collected, so every user-actionable error needs to be surfaced via resource status as well.
-    1. It's usually OK to emit failure Events for each failure. The dedup mechanism will deal with that. The exception is failures that are frequent but typically ephemeral and automatically repairable/recoverable, such as broken socket connections, in which case they should only be reported if persistent and unrepairable, in order to mitigate Event spam.
-1. When a user application stops running for any reason, an Event should be emitted (e.g. Pod evicted because the Node is under memory pressure).
-1. If it's a system-wide change of state that may impact currently running applications or may have a severe impact on future workload schedulability, an Event should be emitted (e.g. Node became unreachable, failed to create route for Node).
-1.
If it doesn't fit any of the above scenarios, you should consider not emitting an Event.
-
-## How to structure an Event?
-The new Event API tries to use more descriptive field names to influence how Events are structured. An Event has the following fields:
-* Regarding
-* Related
-* ReportingController
-* ReportingInstance
-* Action
-* Reason
-* Type
-* Note
-
-The Event should be structured in a way that the following sentence "makes sense":
-"Regarding <Event.Regarding>: <Event.Action> <Event.Related> - <Event.Reason>", e.g.
-* Regarding Node X: BecameNotReady - NodeUnreachable
-* Regarding Pod X: ScheduledOnNode Node Y - <nil>
-* Regarding PVC X: BoundToNode Node Y - <nil>
-* Regarding Pod X: KilledContainer Container Y - NodeMemoryPressure
-
-1. ReportingController is the type of the Controller reporting an Event, e.g. k8s.io/node-controller, k8s.io/kubelet. There will be a standard list of controller names for Kubernetes components. Third-party components must namespace themselves in the same manner as label keys. Validation ensures it's a proper qualified name. This shouldn't be needed in order for users to understand the event, but is provided in case the controller's logs need to be accessed for further debugging.
-1. ReportingInstance is an identifier of the instance of the ReportingController, which needs to uniquely identify it. For instance, host name can be used only for controllers that are guaranteed to be unique on the host. This requirement isn't met e.g. for the scheduler, so it may need a secondary index. For singleton controllers, use the Node name (or hostname if the controller is not running on the Node). Can have at most 128 alphanumeric characters.
-1. Regarding and Related are ObjectReferences. Regarding should represent the object that's implemented by the ReportingController, while Related can contain additional information about another object that takes part in or is affected by the Action (see examples).
-1. Action is a low-cardinality (meaning that there's a restricted, predefined set of values allowed) CamelCase string field (i.e. its value has to be determined at compile time) that explains what happened with Regarding/what action the ReportingController took in Regarding's name. The tuple of {ReportingController, Action, Reason} must be unique, such that a user could look up documentation. Can have at most 128 characters.
-1. Reason is a low-cardinality CamelCase string field (i.e. its value has to be determined at compile time) that explains why the ReportingController took the Action. Can have at most 128 characters.
-1. Type can be either "Normal" or "Warning". "Warning" types are reserved for Events that represent a situation that's not expected in a healthy cluster and/or healthy workload: something unexpected and/or undesirable, at least if it occurs frequently enough and/or for a long enough duration.
-1. Note can contain an arbitrary, high-cardinality, user-readable summary of the Event. This field can lose data if deduplication is triggered. Can have at most 1024 characters.
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-instrumentation/event-style-guide.md.
+This file is a placeholder to preserve links. Please remove by April 28, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/flaky-tests.md b/contributors/devel/flaky-tests.md
index 14302592..7f238095 100644
--- a/contributors/devel/flaky-tests.md
+++ b/contributors/devel/flaky-tests.md
@@ -1,201 +1,3 @@
-# Flaky tests
-
-Any test that fails occasionally is "flaky". Since our merges only proceed when
-all tests are green, and we have a number of different CI systems running the
-tests in various combinations, even a small percentage of flakes results in a
-lot of pain for people waiting for their PRs to merge.
-
-Therefore, it's very important that we write tests defensively. Situations that
-"almost never happen" happen with some regularity when run thousands of times in
-resource-constrained environments. Since flakes can often be quite hard to
-reproduce while still being common enough to block merges occasionally, it's
-additionally important that the test logs be useful for narrowing down exactly
-what caused the failure.
-
-Note that flakes can occur in unit tests, integration tests, or end-to-end
-tests, but probably occur most commonly in end-to-end tests.
-
-## Hunting Flakes
-
-You may notice that lots of your PRs, or ones you watch, share a common
-pre-submit failure, but less frequent issues that are still of concern take
-more analysis over time. There are metrics recorded and viewable in:
-- [TestGrid](https://k8s-testgrid.appspot.com/presubmits-kubernetes-blocking#Summary)
-- [Velodrome](http://velodrome.k8s.io/dashboard/db/bigquery-metrics?orgId=1)
-
-It is worth noting that tests are going to fail in presubmit a lot due
-to unbuildable code, but that won't happen as much on the same commit unless
-there's a true issue in the code or a broader problem, like a dependency
-failing to pull in.
-
-## Filing issues for flaky tests
-
-Because flakes may be rare, it's very important that all relevant logs be
-discoverable from the issue.
-
-1. Search for the test name. If you find an open issue and you're 90% sure the
-   flake is exactly the same, add a comment instead of making a new issue.
-2. If you make a new issue, you should title it with the test name, prefixed by
-   "e2e/unit/integration flake:" (whichever is appropriate).
-3. Reference any old issues you found in step one. Also, make a comment in the
-   old issue referencing your new issue, because people monitoring only their
-   email do not see the backlinks GitHub adds. Alternatively, tag the person or
-   people who most recently worked on it.
-4. Paste, in block quotes, the entire log of the individual failing test, not
-   just the failure line.
-5. Link to durable storage with the rest of the logs. This means (for all the
-   tests that Google runs) the GCS link is mandatory! The Jenkins test result
-   link is nice but strictly optional: not only does it expire more quickly,
-   it's not accessible to non-Googlers.
-
-## Finding failed flaky test cases
-
-Find flaky test issues on GitHub under the [kind/flake issue label][flake].
-There are significant numbers of flaky tests reported on a regular basis, and P2
-flakes are under-investigated. Fixing flakes is a quick way to gain expertise
-and community goodwill.
-
-[flake]: https://github.com/kubernetes/kubernetes/issues?q=is%3Aopen+is%3Aissue+label%3Akind%2Fflake
-
-## Expectations when a flaky test is assigned to you
-
-Note that we won't randomly assign these issues to you unless you've opted in or
-you're part of a group that has opted in.
We are more than happy to accept help
-from anyone in fixing these, but due to the severity of the problem when merges
-are blocked, we need reasonably quick turn-around time on test flakes. Therefore
-we have the following guidelines:
-
-1. If a flaky test is assigned to you, it's more important than anything else
-   you're doing unless you can get a special dispensation (in which case it will
-   be reassigned). If you have too many flaky tests assigned to you, or you
-   have such a dispensation, then it's *still* your responsibility to find new
-   owners (this may just mean giving stuff back to the relevant Team or SIG Lead).
-2. You should make a reasonable effort to reproduce it. Somewhere between an
-   hour and half a day of concentrated effort is "reasonable". It is perfectly
-   reasonable to ask for help!
-3. If you can reproduce it (or it's obvious from the logs what happened), you
-   should then be able to fix it, or in the case where someone is clearly more
-   qualified to fix it, reassign it with very clear instructions.
-4. Once you have made a change that you believe fixes a flake, it is conservative
-   to keep the issue for the flake open and see if it manifests again after the
-   change is merged.
-5. If you can't reproduce a flake: __don't just close it!__ Every time a flake comes
-   back, at least 2 hours of merge time is wasted. So we need to make monotonic
-   progress towards narrowing it down every time a flake occurs. If you can't
-   figure it out from the logs, add log messages that would have helped you figure
-   it out. If you make changes to make a flake more reproducible, please link
-   your pull request to the flake you're working on.
-6. If a flake has been open, could not be reproduced, and has not manifested in
-   3 months, it is reasonable to close the flake issue with a note saying
-   why.
-
-## Reproducing unit test flakes
-
-Try the [stress command](https://godoc.org/golang.org/x/tools/cmd/stress).
-
-Install it:
-
-```
-$ go install golang.org/x/tools/cmd/stress
-```
-
-Then build your test binary:
-
-```
-$ go test -c -race
-```
-
-Then run it under stress:
-
-```
-$ stress ./package.test -test.run=FlakyTest
-```
-
-It runs the command and writes output to `/tmp/gostress-*` files when it fails.
-It periodically reports with run counts. Be careful with tests that use the
-`net/http/httptest` package; they could exhaust the available ports on your
-system!
-
-## Hunting flaky unit tests in Kubernetes
-
-Sometimes unit tests are flaky. This means that due to (usually) race
-conditions, they will occasionally fail, even though most of the time they pass.
-
-We have a goal of 99.9% flake-free tests. This means that there is only one
-flake in one thousand runs of a test.
-
-Running a test 1000 times on your own machine can be tedious and time-consuming.
-Fortunately, there is a better way to achieve this using Kubernetes.
-
-_Note: these instructions are mildly hacky for now; as we get run-once semantics
-and logging, they will get better._
-
-There is a testing image `brendanburns/flake` on Docker Hub. We will use
-this image to test our fix.
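-
-If you want to sanity-check the image locally first, a quick hedged sketch
-(assuming a working docker daemon, and reusing the same TEST_PACKAGE and
-REPO_SPEC environment variables as the controller config below):
-
-```sh
-docker pull brendanburns/flake
-docker run -e TEST_PACKAGE=pkg/tools \
-  -e REPO_SPEC=https://github.com/kubernetes/kubernetes \
-  brendanburns/flake
-```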
- -Create a replication controller with the following config: - -```yaml -apiVersion: v1 -kind: ReplicationController -metadata: - name: flakecontroller -spec: - replicas: 24 - template: - metadata: - labels: - name: flake - spec: - containers: - - name: flake - image: brendanburns/flake - env: - - name: TEST_PACKAGE - value: pkg/tools - - name: REPO_SPEC - value: https://github.com/kubernetes/kubernetes -``` - -Note that we omit the labels and the selector fields of the replication -controller, because they will be populated from the labels field of the pod -template by default. - -```sh -kubectl create -f ./controller.yaml -``` - -This will spin up 24 instances of the test. They will run to completion, then -exit, and the kubelet will restart them, accumulating more and more runs of the -test. - -You can examine the recent runs of the test by calling `docker ps -a` and -looking for tasks that exited with non-zero exit codes. Unfortunately, docker -ps -a only keeps around the exit status of the last 15-20 containers with the -same image, so you have to check them frequently. - -You can use this script to automate checking for failures, assuming your cluster -is running on GCE and has four nodes: - -```sh -echo "" > output.txt -for i in {1..4}; do - echo "Checking kubernetes-node-${i}" - echo "kubernetes-node-${i}:" >> output.txt - gcloud compute ssh "kubernetes-node-${i}" --command="sudo docker ps -a" >> output.txt -done -grep "Exited ([^0])" output.txt -``` - -Eventually you will have sufficient runs for your purposes. At that point you -can delete the replication controller by running: - -```sh -kubectl delete replicationcontroller flakecontroller -``` - -If you do a final check for flakes with `docker ps -a`, ignore tasks that -exited -1, since that's what happens when you stop the replication controller. - -Happy flake hunting! +This file has moved to https://git.k8s.io/community/contributors/devel/sig-testing/flaky-tests.md. +This file is a placeholder to preserve links. Please remove by April 30, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/flexvolume.md b/contributors/devel/flexvolume.md
index 12c46382..36fe837d 100644
--- a/contributors/devel/flexvolume.md
+++ b/contributors/devel/flexvolume.md
@@ -1,155 +1,3 @@
-# Flexvolume
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-storage/flexvolume.md.
-
-Flexvolume enables users to write their own drivers and add support for their volumes in Kubernetes. Vendor drivers should be installed in the volume plugin path on every node, and on the master if the driver requires attach capability (unless the `--enable-controller-attach-detach` Kubelet option is set to false, but this is highly discouraged because it is a legacy mode of operation).
-
-Flexvolume is a GA feature from the Kubernetes 1.8 release onwards.
-
-## Prerequisites
-
-Install the vendor driver on all nodes (also on master nodes if the "--enable-controller-attach-detach" Kubelet option is enabled) in the plugin path. Path for installing the plugin: `<plugindir>/<vendor~driver>/<driver>`. The default plugin directory is `/usr/libexec/kubernetes/kubelet-plugins/volume/exec/`. It can be changed in the kubelet via the `--volume-plugin-dir` flag, and in the controller manager via the `--flex-volume-plugin-dir` flag.
-
-For example, to add a `cifs` driver by vendor `foo`, install the driver at: `/usr/libexec/kubernetes/kubelet-plugins/volume/exec/foo~cifs/cifs`
-
-The vendor and driver names must match flexVolume.driver in the volume spec, with '~' replaced with '/'. For example, if `flexVolume.driver` is set to `foo/cifs`, then the vendor is `foo`, and the driver is `cifs`.
-
-## Dynamic Plugin Discovery
-Beginning in v1.8, Flexvolume supports the ability to detect drivers on the fly. Instead of requiring drivers to exist at system initialization time or having to restart kubelet or controller manager, drivers can be installed, upgraded/downgraded, and uninstalled while the system is running.
-For more information, please refer to the [design document](/contributors/design-proposals/storage/flexvolume-deployment.md).
-
-## Automated Plugin Installation/Upgrade
-One possible way to install and upgrade your Flexvolume drivers is by using a DaemonSet. See [Recommended Driver Deployment Method](/contributors/design-proposals/storage/flexvolume-deployment.md#recommended-driver-deployment-method) for details, and see [here](https://git.k8s.io/examples/staging/volumes/flexvolume/deploy/) for an example.
-
-## Plugin details
-The plugin expects the following call-outs to be implemented for the backend drivers. Some call-outs are optional. Call-outs are invoked from the Kubelet and the Controller Manager.
-
-### Driver invocation model:
-
-#### Init:
-Initializes the driver. Called during Kubelet & Controller Manager initialization. On success, the function returns a capabilities map showing whether each Flexvolume capability is supported by the driver.
-Current capabilities:
-* `attach` - a boolean field indicating whether the driver requires attach and detach operations. This field is *required*, although for backward-compatibility the default value is set to `true`, i.e. requires attach and detach.
-See [Driver output](#driver-output) for the capabilities map format.
-```
-<driver executable> init
-```
-
-#### Attach:
-Attach the volume specified by the given spec on the given node. On success, returns the device path where the device is attached on the node. Called from the Controller Manager.
-
-This call-out does not pass "secrets" specified in the Flexvolume spec.
If your driver requires secrets, do not implement this call-out; instead use the "mount" call-out and implement attach and mount there.
-
-```
-<driver executable> attach <json options> <node name>
-```
-
-#### Detach:
-Detach the volume from the node. Called from the Controller Manager.
-```
-<driver executable> detach <mount device> <node name>
-```
-
-#### Wait for attach:
-Wait for the volume to be attached on the remote node. On success, the path to the device is returned. Called from the Controller Manager. The timeout should be 10m (based on https://git.k8s.io/kubernetes/pkg/kubelet/volumemanager/volume_manager.go#L88).
-
-```
-<driver executable> waitforattach <mount device> <json options>
-```
-
-#### Volume is Attached:
-Check whether the volume is attached on the node. Called from the Controller Manager.
-
-```
-<driver executable> isattached <json options> <node name>
-```
-
-#### Mount device:
-Mounts the device to a global path which individual pods can then bind mount. Called only from the Kubelet.
-
-This call-out does not pass "secrets" specified in the Flexvolume spec. If your driver requires secrets, do not implement this call-out; instead use the "mount" call-out and implement attach and mount there.
-
-```
-<driver executable> mountdevice <mount dir> <mount device> <json options>
-```
-
-#### Unmount device:
-Unmounts the global mount for the device. This is called once all bind mounts have been unmounted. Called only from the Kubelet.
-
-```
-<driver executable> unmountdevice <mount device>
-```
-In addition to the user-specified options and [default JSON options](#default-json-options), the following options capturing information about the pod are passed through and generated automatically.
-
-```
-kubernetes.io/pod.name
-kubernetes.io/pod.namespace
-kubernetes.io/pod.uid
-kubernetes.io/serviceAccount.name
-```
-
-#### Mount:
-Mount the volume at the mount dir. This call-out defaults to a bind mount for drivers which implement the attach & mount-device call-outs. Called only from the Kubelet.
-
-```
-<driver executable> mount <mount dir> <json options>
-```
-
-#### Unmount:
-Unmount the volume. This call-out defaults to unmounting the bind mount for drivers which implement the attach & mount-device call-outs. Called only from the Kubelet.
-
-```
-<driver executable> unmount <mount dir>
-```
-
-See [lvm] & [nfs] for a quick example of how to write a simple flexvolume driver.
-
-### Driver output:
-
-Flexvolume expects the driver to reply with the status of the operation in the
-following format.
-
-```
-{
-    "status": "<Success/Failure/Not supported>",
-    "message": "<Reason for success/failure>",
-    "device": "<Path to the device attached. This field is valid only for attach & waitforattach call-outs>"
-    "volumeName": "<Cluster wide unique name of the volume. Valid only for getvolumename call-out>"
-    "attached": <True/False (Return true if volume is attached on the node. Valid only for isattached call-out)>
-    "capabilities": <Only included as part of the Init response>
-    {
-        "attach": <True/False (Return true if the driver implements attach and detach)>
-    }
-}
-```
-
-### Default JSON options
-
-In addition to the flags specified by the user in the Options field of the FlexVolumeSource, the following flags (set through their corresponding FlexVolumeSource fields) are also passed to the executable.
-Note: Secrets are passed only to "mount/unmount" call-outs.
- -``` -"kubernetes.io/fsType":"<FS type>", -"kubernetes.io/readwrite":"<rw>", -"kubernetes.io/fsGroup":"<FS group>", -"kubernetes.io/mountsDir":"<string>", -"kubernetes.io/pvOrVolumeName":"<Volume name if the volume is in-line in the pod spec; PV name if the volume is a PV>" - -"kubernetes.io/pod.name":"<string>", -"kubernetes.io/pod.namespace":"<string>", -"kubernetes.io/pod.uid":"<string>", -"kubernetes.io/serviceAccount.name":"<string>", - -"kubernetes.io/secret/key1":"<secret1>" -... -"kubernetes.io/secret/keyN":"<secretN>" -``` - -### Example of Flexvolume - -Please refer to the [Flexvolume example directory]. See [nginx-lvm.yaml] & [nginx-nfs.yaml] for a quick example on how to use Flexvolume in a pod. - - -[lvm]: https://git.k8s.io/examples/staging/volumes/flexvolume/lvm -[nfs]: https://git.k8s.io/examples/staging/volumes/flexvolume/nfs -[nginx-lvm.yaml]: https://git.k8s.io/examples/staging/volumes/flexvolume/nginx-lvm.yaml -[nginx-nfs.yaml]: https://git.k8s.io/examples/staging/volumes/flexvolume/nginx-nfs.yaml -[Flexvolume example directory]: https://git.k8s.io/examples/staging/volumes/flexvolume/ +This file is a placeholder to preserve links. Please remove by April 29, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file diff --git a/contributors/devel/getting-builds.md b/contributors/devel/getting-builds.md index 0ae7031b..3e35fe73 100644 --- a/contributors/devel/getting-builds.md +++ b/contributors/devel/getting-builds.md @@ -1,48 +1,3 @@ -# Getting Kubernetes Builds +This file has moved to https://git.k8s.io/community/contributors/devel/sig-release/getting-builds.md. -You can use [hack/get-build.sh](http://releases.k8s.io/HEAD/hack/get-build.sh) -to get a build or to use as a reference on how to get the most recent builds -with curl. With `get-build.sh` you can grab the most recent stable build, the -most recent release candidate, or the most recent build to pass our ci and gce -e2e tests (essentially a nightly build). - -Run `./hack/get-build.sh -h` for its usage. - -To get a build at a specific version (v1.1.1) use: - -```console -./hack/get-build.sh v1.1.1 -``` - -To get the latest stable release: - -```console -./hack/get-build.sh release/stable -``` - -Use the "-v" option to print the version number of a build without retrieving -it. For example, the following prints the version number for the latest ci -build: - -```console -./hack/get-build.sh -v ci/latest -``` - -You can also use the gsutil tool to explore the Google Cloud Storage release -buckets. Here are some examples: - -```sh -gsutil cat gs://kubernetes-release-dev/ci/latest.txt # output the latest ci version number -gsutil cat gs://kubernetes-release-dev/ci/latest-green.txt # output the latest ci version number that passed gce e2e -gsutil ls gs://kubernetes-release-dev/ci/v0.20.0-29-g29a55cc/ # list the contents of a ci release -gsutil ls gs://kubernetes-release/release # list all official releases and rcs -``` - -## Install `gsutil` - -Example installation: - -```console -$ curl -sSL https://storage.googleapis.com/pub/gsutil.tar.gz | sudo tar -xz -C /usr/local/src -$ sudo ln -s /usr/local/src/gsutil/gsutil /usr/bin/gsutil -``` +This file is a placeholder to preserve links. Please remove by April 29, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file diff --git a/contributors/devel/gubernator.md b/contributors/devel/gubernator.md index b03d11a1..34cb58fb 100644 --- a/contributors/devel/gubernator.md +++ b/contributors/devel/gubernator.md @@ -1,136 +1,3 @@ -# Gubernator +This file has moved to https://git.k8s.io/community/contributors/devel/sig-testing/gubernator.md. -*This document is oriented at developers who want to use Gubernator to debug while developing for Kubernetes.* - - -- [Gubernator](#gubernator) - - [What is Gubernator?](#what-is-gubernator) - - [Gubernator Features](#gubernator-features) - - [Test Failures list](#test-failures-list) - - [Log Filtering](#log-filtering) - - [Gubernator for Local Tests](#gubernator-for-local-tests) - - [Future Work](#future-work) - - -## What is Gubernator? - -[Gubernator](https://k8s-gubernator.appspot.com/) is a webpage for viewing and filtering Kubernetes -test results. - -Gubernator simplifies the debugging process and makes it easier to track down failures by automating many -steps commonly taken in searching through logs, and by offering tools to filter through logs to find relevant lines. -Gubernator automates the steps of finding the failed tests, displaying relevant logs, and determining the -failed pods and the corresponding pod UID, namespace, and container ID. -It also allows for filtering of the log files to display relevant lines based on selected keywords, and -allows for multiple logs to be woven together by timestamp. - -Gubernator runs on Google App Engine and fetches logs stored on Google Cloud Storage. - -## Gubernator Features - -### Test Failures list - -Comments made by k8s-ci-robot will post a link to a page listing the failed tests. -Each failed test comes with the corresponding error log from a junit file and a link -to filter logs for that test. - -Based on the message logged in the junit file, the pod name may be displayed. - - - -[Test Failures List Example](https://k8s-gubernator.appspot.com/build/kubernetes-jenkins/logs/kubernetes-e2e-gke/11721) - -### Log Filtering - -The log filtering page comes with checkboxes and textboxes to aid in filtering. Filtered keywords will be bolded -and lines including keywords will be highlighted. Up to four lines around the line of interest will also be displayed. - - - -If less than 100 lines are skipped, the "... skipping xx lines ..." message can be clicked to expand and show -the hidden lines. - -Before expansion: - -After expansion: - - -If the pod name was displayed in the Test Failures list, it will automatically be included in the filters. -If it is not found in the error message, it can be manually entered into the textbox. Once a pod name -is entered, the Pod UID, Namespace, and ContainerID may be automatically filled in as well. These can be -altered as well. To apply the filter, check off the options corresponding to the filter. - - - -To add a filter, type the term to be filtered into the textbox labeled "Add filter:" and press enter. -Additional filters will be displayed as checkboxes under the textbox. - - - -To choose which logs to view check off the checkboxes corresponding to the logs of interest. If multiple logs are -included, the "Weave by timestamp" option can weave the selected logs together based on the timestamp in each line. 
- - - -[Log Filtering Example 1](https://k8s-gubernator.appspot.com/build/kubernetes-jenkins/logs/kubelet-gce-e2e-ci/5535/nodelog?pod=pod-configmaps-b5b876cb-3e1e-11e6-8956-42010af0001d&junit=junit_03.xml&wrap=on&logfiles=%2Fkubernetes-jenkins%2Flogs%2Fkubelet-gce-e2e-ci%2F5535%2Fartifacts%2Ftmp-node-e2e-7a5a3b40-e2e-node-coreos-stable20160622-image%2Fkube-apiserver.log&logfiles=%2Fkubernetes-jenkins%2Flogs%2Fkubelet-gce-e2e-ci%2F5535%2Fartifacts%2Ftmp-node-e2e-7a5a3b40-e2e-node-coreos-stable20160622-image%2Fkubelet.log&UID=on&poduid=b5b8a59e-3e1e-11e6-b358-42010af0001d&ns=e2e-tests-configmap-oi12h&cID=tmp-node-e2e-7a5a3b40-e2e-node-coreos-stable20160622-image) - -[Log Filtering Example 2](https://k8s-gubernator.appspot.com/build/kubernetes-jenkins/logs/kubernetes-e2e-gke/11721/nodelog?pod=client-containers-a53f813c-503e-11e6-88dd-0242ac110003&junit=junit_19.xml&wrap=on) - - -### Gubernator for Local Tests - -*Currently Gubernator can only be used with remote node e2e tests.* - -**NOTE: Using Gubernator with local tests will publicly upload your test logs to Google Cloud Storage** - -To use Gubernator to view logs from local test runs, set the GUBERNATOR tag to true. -A URL link to view the test results will be printed to the console. -Please note that running with the Gubernator tag will bypass the user confirmation for uploading to GCS. - -```console - -$ make test-e2e-node REMOTE=true GUBERNATOR=true -... -================================================================ -Running gubernator.sh - -Gubernator linked below: -k8s-gubernator.appspot.com/build/yourusername-g8r-logs/logs/e2e-node/timestamp -``` - -The gubernator.sh script can be run after running a remote node e2e test for the same effect. - -```console -$ ./test/e2e_node/gubernator.sh -Do you want to run gubernator.sh and upload logs publicly to GCS? [y/n]y -... -Gubernator linked below: -k8s-gubernator.appspot.com/build/yourusername-g8r-logs/logs/e2e-node/timestamp -``` - -## Future Work - -Gubernator provides a framework for debugging failures and introduces useful features. -There is still a lot of room for more features and growth to make the debugging process more efficient. - -How to contribute (see https://git.k8s.io/test-infra/gubernator/README.md) - -* Extend GUBERNATOR flag to all local tests - -* More accurate identification of pod name, container ID, etc. - * Change content of logged strings for failures to include more information - * Better regex in Gubernator - -* Automate discovery of more keywords - * Volume Name - * Disk Name - * Pod IP - -* Clickable API objects in the displayed lines in order to add them as filters - -* Construct story of pod's lifetime - * Have concise view of what a pod went through from when pod was started to failure - -* Improve UI - * Have separate folders of logs in rows instead of in one long column - * Improve interface for adding additional features (maybe instead of textbox and checkbox, have chips) +This file is a placeholder to preserve links. Please remove by April 30, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/instrumentation.md b/contributors/devel/instrumentation.md
index b0a11193..110359b2 100644
--- a/contributors/devel/instrumentation.md
+++ b/contributors/devel/instrumentation.md
@@ -1,215 +1,3 @@
-## Instrumenting Kubernetes
-
-The following outlines general guidelines, with references, for metric instrumentation
-in Kubernetes components. Components are instrumented using the
-[Prometheus Go client library](https://github.com/prometheus/client_golang). For non-Go
-components, [libraries in other languages](https://prometheus.io/docs/instrumenting/clientlibs/)
-are available.
-
-The metrics are exposed via HTTP in the
-[Prometheus metric format](https://prometheus.io/docs/instrumenting/exposition_formats/),
-which is open and well-understood by a wide range of third-party applications and vendors
-outside of the Prometheus ecosystem.
-
-The [general instrumentation advice](https://prometheus.io/docs/practices/instrumentation/)
-from the Prometheus documentation applies. This document reiterates common pitfalls and some
-Kubernetes-specific considerations.
-
-Prometheus metrics are cheap as they have minimal internal memory state. Set and increment
-operations are thread-safe and take 10-25 nanoseconds (Go & Java).
-Thus, instrumentation can and should cover all operationally relevant aspects of an application,
-internal and external.
-
-## Quick Start
-
-The following describes the basic steps required to add a new metric (in Go).
-
-1. Import "github.com/prometheus/client_golang/prometheus".
-
-2. Create a top-level var to define the metric. For this, you have to:
-
-    1. Pick the type of metric. Use a Gauge for things you want to set to a
-particular value, a Counter for things you want to increment, or a Histogram or
-Summary for histograms/distributions of values (typically for latency).
-Histograms are better if you're going to aggregate the values across jobs, while
-summaries are better if you just want the job to give you a useful summary of
-the values.
-    2. Give the metric a name and description.
-    3. Pick whether you want to distinguish different categories of things using
-labels on the metric. If so, add "Vec" to the name of the type of metric you
-want and add a slice of the label names to the definition.
-
-   [Example](https://github.com/kubernetes/kubernetes/blob/cd3299307d44665564e1a5c77d0daa0286603ff5/pkg/apiserver/apiserver.go#L53)
-    ```go
-    requestCounter = prometheus.NewCounterVec(
-        prometheus.CounterOpts{
-            Name: "apiserver_request_count",
-            Help: "Counter of apiserver requests broken out for each verb, API resource, client, and HTTP response code.",
-        },
-        []string{"verb", "resource", "client", "code"},
-    )
-    ```
-
-3. Register the metric so that prometheus will know to export it.
-
-    [Example](https://github.com/kubernetes/kubernetes/blob/cd3299307d44665564e1a5c77d0daa0286603ff5/pkg/apiserver/apiserver.go#L78)
-    ```go
-    func init() {
-        prometheus.MustRegister(requestCounter)
-        prometheus.MustRegister(requestLatencies)
-        prometheus.MustRegister(requestLatenciesSummary)
-    }
-    ```
-
-4.
Use the metric by calling the appropriate method for your metric type (Set,
-Inc/Add, or Observe, respectively for Gauge, Counter, or Histogram/Summary),
-first calling WithLabelValues if your metric has any labels.
-
-   [Example](https://github.com/kubernetes/kubernetes/blob/cd3299307d44665564e1a5c77d0daa0286603ff5/pkg/apiserver/apiserver.go#L87)
-    ```go
-    requestCounter.WithLabelValues(*verb, *resource, client, strconv.Itoa(*httpCode)).Inc()
-    ```
-
-
-## Instrumentation types
-
-Components have metrics capturing events and states that are inherent to their
-application logic. Examples are request and error counters, request latency
-histograms, or internal garbage collection cycles. Those metrics are instrumented
-directly in the application code.
-
-Secondly, there are business logic metrics. Those are not about observed application
-behavior but abstract system state, such as desired replicas for a deployment.
-They are not directly instrumented but collected from otherwise exposed data.
-
-In Kubernetes they are generally captured in the [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics)
-component, which reads them from the API server.
-For this type of metric exposition, the
-[exporter guidelines](https://prometheus.io/docs/instrumenting/writing_exporters/)
-apply additionally.
-
-## Naming
-
-Metrics added directly by application or package code should have a unique name.
-This avoids collisions of metrics added via dependencies. Unique names also clearly
-distinguish metrics collected with different semantics. This is solved through
-prefixes:
-
-```
-<component_name>_<metric>
-```
-
-For example, suppose the kubelet instruments its HTTP requests but also uses
-an HTTP router that provides its own instrumentation. Both expose metrics on total
-HTTP requests. They should be distinguishable, as in:
-
-```
-kubelet_http_requests_total{path="/some/path",status="200"}
-routerpkg_http_requests_total{path="/some/path",status="200",method="GET"}
-```
-
-As we can see, they expose different labels, so a naming collision could not
-have been resolved even if both metrics counted the exact same
-requests.
-
-Resource objects that occur in names should inherit the spelling that is used
-in kubectl, i.e. daemon sets are `daemonset` rather than `daemon_set`.
-
-## Dimensionality & Cardinality
-
-Metrics can often replace more expensive logging as they are time-aggregated
-over a sampling interval. The [multidimensional data model](https://prometheus.io/docs/concepts/data_model/)
-enables deep insights, and all metrics should use those label dimensions
-where appropriate.
-
-A common error that often causes performance issues in the ingesting metric
-system is using dimensions that are too specific, which inhibits or eliminates
-time aggregation. Typically those are user IDs or error messages.
-More generally: one should know a comprehensive list of all possible values
-for a label at instrumentation time.
-
-Notable exceptions are exporters like kube-state-metrics, which expose per-pod
-or per-deployment metrics, which are theoretically unbounded over time as one could
-constantly create new ones, with new names. However, they have
-a reasonable upper bound for a given size of infrastructure they refer to and
-its typical frequency of changes.
-
-In general, "external" labels like pod or node name do not belong in the
-instrumentation itself.
They are to be attached to metrics by the collecting
-system that has the external knowledge ([blog post](https://www.robustperception.io/target-labels-are-for-life-not-just-for-christmas/)).
-
-## Normalization
-
-Metrics should be normalized with respect to their dimensions. They should
-expose the minimal set of labels, each of which provides additional information.
-Labels that are composed from values of different labels are not desirable.
-For example:
-
-```
-example_metric{pod="abc",container="proxy",container_long="abc/proxy"}
-```
-
-It often seems feasible to add additional meta information about an object
-to all metrics about that object, e.g.:
-
-```
-kube_pod_container_restarts{namespace=...,pod=...,container=...}
-```
-
-A common use case is wanting to look at such metrics with respect to the node the
-pod is scheduled on. So it seems convenient to add a "node" label.
-
-```
-kube_pod_container_restarts{namespace=...,pod=...,container=...,node=...}
-```
-
-This, however, only caters to one specific query use case. There are many more
-pieces of metadata that could be added, effectively blowing up the instrumentation.
-They are also not guaranteed to be stable over time. What if pods at some
-point can be live-migrated?
-Those pieces of information should be normalized into an info-level metric
-([blog post](https://www.robustperception.io/exposing-the-software-version-to-prometheus/)),
-which is always set to 1. For example:
-
-```
-kube_pod_info{pod=...,namespace=...,pod_ip=...,host_ip=..,node=..., ...}
-```
-
-The metric system can later denormalize those along the identifying `pod`
-and `namespace` labels. This leads to...
-
-## Resource Referencing
-
-It is often desirable to correlate different metrics about a common object,
-such as a pod. Label dimensions can be used to match up different metrics.
-This is easiest if label names and values follow a common pattern.
-For metrics exposed by the same application, that often happens naturally.
-
-For a system composed of several independent, and also pluggable, components,
-it makes sense to set cross-component standards to allow easy querying in
-metric systems without extensive post-processing of data.
-In Kubernetes, those are the resource objects such as deployments,
-pods, or services and the namespace they belong to.
-
-The following should be consistently used:
-
-```
-example_metric_ccc{pod="example-app-5378923", namespace="default"}
-```
-
-An object is referenced by its unique name in a label named after the resource
-itself (i.e. `pod`/`deployment`/... and not `pod_name`/`deployment_name`)
-and the namespace it belongs to in the `namespace` label.
-
-Note: namespace/name combinations are only unique at a certain point in time.
-For time series this is given by the timestamp associated with any data point.
-UUIDs are truly unique but not convenient to use in user-facing time series
-queries.
-They can still be incorporated using an info-level metric as described above for
-`kube_pod_info`. A query to a metric system selecting by UUID via the info-level
-metric could look as follows:
-
-```
-kube_pod_restarts and on(namespace, pod) kube_pod_info{uuid="ABC"}
-```
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-instrumentation/instrumentation.md.
+This file is a placeholder to preserve links. Please remove by April 28, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/kubemark-guide.md b/contributors/devel/kubemark-guide.md
index ce5727e8..a92b19f9 100644
--- a/contributors/devel/kubemark-guide.md
+++ b/contributors/devel/kubemark-guide.md
@@ -1,256 +1,3 @@
-# Kubemark User Guide
-
-## Introduction
-
-Kubemark is a performance testing tool which allows users to run experiments on
-simulated clusters. The primary use case is scalability testing, as simulated
-clusters can be much bigger than the real ones. The objective is to expose
-problems with the master components (API server, controller manager or
-scheduler) that appear only on bigger clusters (e.g. small memory leaks).
-
-This document serves as a primer to understand what Kubemark is, what it is not,
-and how to use it.
-
-## Architecture
-
-On a very high level, a Kubemark cluster consists of two parts: a real master
-and a set of "Hollow" Nodes. The prefix "Hollow" on any component means an
-implementation/instantiation of the actual component with all "moving"
-parts mocked out. The best example is HollowKubelet, which pretends to be an
-ordinary Kubelet, but does not start anything, nor mount any volumes - it just
-pretends that it does. More detailed design and implementation details are at the end
-of this document.
-
-Currently, master components run on a dedicated machine as pods that are
-created/managed by kubelet, which itself runs as either a systemd or a supervisord
-service on the master VM depending on the VM distro (though currently it is
-only systemd as we use a GCI image). Having a dedicated machine for the master
-has a slight advantage over running the master components on an external cluster,
-which is being able to completely isolate master resources from everything else.
-The HollowNodes, on the other hand, are run on an 'external' Kubernetes cluster
-as pods in an isolated namespace (named kubemark). This idea of using pods on a
-real cluster to behave (or act) as nodes of the kubemark cluster lies at the heart of
-kubemark's design.
-
-## Requirements
-
-To run Kubemark you need a Kubernetes cluster (called the `external cluster`)
-for running all your HollowNodes and a dedicated machine for a master.
-The master machine has to be directly routable from the HollowNodes. You also need
-access to a Docker repository (which is gcr.io in the case of GCE) that has the
-container images for etcd, hollow-node and node-problem-detector.
-
-Currently, the scripts are written to be easily usable on GCE, but it should be
-relatively straightforward to port them to different providers or bare metal.
-There is an ongoing effort to refactor kubemark code into provider-specific (gce)
-and provider-independent code, which should make it relatively simple to run
-kubemark clusters on other cloud providers as well.
-
-## Common use cases and helper scripts
-
-The common workflow for Kubemark is:
-- starting a Kubemark cluster (on GCE)
-- running e2e tests on the Kubemark cluster
-- monitoring test execution and debugging problems
-- turning down the Kubemark cluster
-
-(For now) the descriptions include comments that are helpful for anyone who wants
-to port Kubemark to different providers.
-(Later) When the refactoring mentioned in the above section finishes, we will replace
-these comments with a clean API that allows kubemark to run on top of any provider.
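-
-On GCE, that workflow maps onto the helper scripts roughly as follows (a
-sketch; the sections below describe each step in detail, and
-`stop-kubemark.sh` is the teardown counterpart to `start-kubemark.sh`):
-
-```sh
-# Build a release so the kubemark binaries and hollow-node image exist.
-make quick-release
-
-# Start the Kubemark cluster (master VM plus HollowNode pods).
-test/kubemark/start-kubemark.sh
-
-# Run e2e tests against it; flags are passed through to hack/ginkgo-e2e.sh.
-test/kubemark/run-e2e-tests.sh --ginkgo.focus="Load"
-
-# Turn the Kubemark cluster down when finished.
-test/kubemark/stop-kubemark.sh
-```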
-
-### Starting a Kubemark cluster
-
-To start a Kubemark cluster on GCE you need to create an external Kubernetes
-cluster (it can be GCE, GKE or anything else) by yourself, make sure that kubeconfig
-points to it by default, build a Kubernetes release (e.g. by running
-`make quick-release`) and run the `test/kubemark/start-kubemark.sh` script.
-This script will create a VM for the master (along with a mounted PD and firewall rules set),
-then start kubelet and run the pods for the master components. Following this, it
-sets up the HollowNodes as Pods on the external cluster and does all the setup necessary
-to let them talk to the kubemark apiserver. It will use the configuration stored in
-`cluster/kubemark/config-default.sh` - you can tweak it however you want, but note that
-some features may not be implemented yet, as the implementation of Hollow components/mocks
-will probably be lagging behind the ‘real’ ones. For performance tests, the interesting variables
-are `NUM_NODES` and `KUBEMARK_MASTER_SIZE`. After the start-kubemark script finishes,
-you’ll have a ready Kubemark cluster, and a kubeconfig file for talking to the Kubemark
-cluster is stored in `test/kubemark/resources/kubeconfig.kubemark`.
-
-Currently we're running HollowNode with a limit of 0.09 CPU core/pod and 220MB of memory.
-However, if we also take into account the resources absorbed by default cluster addons
-and fluentd running on the 'external' cluster, this limit becomes ~0.1 CPU core/pod,
-thus allowing ~10 HollowNodes to run per core (on an "n1-standard-8" VM node).
-
-#### Behind-the-scenes details
-
-The start-kubemark.sh script does quite a lot of things:
-
-- Prepares a master machine named MASTER_NAME (this variable's value should be set by this point):
-  (*the steps below use gcloud, and should be easy to do outside of GCE*)
-  1. Creates a Persistent Disk for use by the master (one more for etcd-events, if flagged)
-  2. Creates a static IP address for the master in the cluster and assigns it to the variable MASTER_IP
-  3. Creates a VM instance for the master, configured with the PD and IP created above.
-  4. Sets a firewall rule on the master to open port 443\* for all TCP traffic by default.
-
-<sub>\* Port 443 is a secured port on the master machine which is used for all
-external communication with the API server. In the last sentence *external*
-means all traffic coming from other machines, including all the Nodes, not only
-from outside of the cluster. Currently, local components, i.e. ControllerManager
-and Scheduler, talk with the API server using the insecure port 8080.</sub>
-
-- [Optional to read] Establishes the certs/keys required for setting up the PKI for the kubemark cluster:
-  (*the steps below are independent of GCE and work for all providers*)
-  1. Generates a randomly named temporary directory for storing PKI certs/keys which is delete-trapped on EXIT.
-  2. Creates a bearer token for 'admin' on the master.
-  3. Generates a certificate for the CA and a (certificate + private-key) pair for each of master, kubelet and kubecfg.
-  4. Generates kubelet and kubeproxy tokens for the master.
-  5. Writes a kubeconfig locally to `test/kubemark/resources/kubeconfig.kubemark` for enabling local kubectl use.
-
-- Sets up the environment and starts the master components (through the `start-kubemark-master.sh` script):
-  (*the steps below use gcloud for SSH and SCP to the master, and should be easy to do outside of GCE*)
-  1. SSHes to the master machine, creates a new directory (`/etc/srv/kubernetes`) and writes all the
-     certs/keys/tokens/passwords to it.
-  2. SCPs all the master pod manifests, shell scripts (`start-kubemark-master.sh`, `configure-kubectl.sh`, etc.) and
-     config files for passing env variables (`kubemark-master-env.sh`) from the local machine to the master.
-  3. SSHes to the master machine and runs the startup script `start-kubemark-master.sh` (and possibly others).
-
-  Note: The directory structure and the functions performed by the startup script(s) can vary based on the master distro.
-  We currently support the GCI image `gci-dev-56-8977-0-0` in GCE.
-
-- Sets up and starts the HollowNodes (as pods) on the external cluster:
-  (*the steps below (except the 2nd and 3rd) are independent of GCE and work for all providers*)
-  1. Identifies the right kubemark binary from the current kubernetes repo for the platform linux/amd64.
-  2. Creates a Docker image for the HollowNode using this binary and uploads it to a remote Docker repository.
-     (We use gcr.io/ as our remote docker repository in GCE; this should be different for other providers.)
-  3. [One-off] Creates and uploads a Docker image for NodeProblemDetector (see the kubernetes/node-problem-detector repo),
-     which is one of the containers in the HollowNode pod, besides HollowKubelet and HollowProxy. However, we
-     use it with a hollow config that essentially has an empty set of rules and conditions to be detected.
-     This step is required only for other cloud providers, as the docker image for GCE already exists on GCR.
-  4. Creates a secret which stores the kubeconfig for use by HollowKubelet/HollowProxy, addons, and configMaps
-     for the HollowNode and the HollowNodeProblemDetector.
-  5. Creates a ReplicationController for the HollowNodes that starts them up, after replacing all variables in
-     the hollow-node_template.json resource.
-  6. Waits until all HollowNodes are in the Running phase.
-
-### Running e2e tests on Kubemark cluster
-
-To run a standard e2e test on the Kubemark cluster created in the previous step,
-execute the `test/kubemark/run-e2e-tests.sh` script. It will configure ginkgo to
-use the Kubemark cluster instead of an ordinary one and start an e2e test. This
-script should not need any changes to work on other cloud providers.
-
-By default (if nothing is passed to it) the script will run a Density '30
-test. If you want to run a different e2e test, you just need to provide the flags you want to be
-passed to the `hack/ginkgo-e2e.sh` script, e.g. `--ginkgo.focus="Load"` to run the
-Load test.
-
-By default, at the end of each test, it will delete namespaces and everything
-under them (e.g. events, replication controllers) on the Kubemark master, which takes
-a lot of time. Such work isn't needed in most cases: if you delete your
-Kubemark cluster after running `run-e2e-tests.sh`; if you don't care about
-namespace deletion performance, specifically related to etcd; etc. There is a
-flag that enables you to avoid namespace deletion: `--delete-namespace=false`.
-Adding the flag should let you see in the logs: `Found DeleteNamespace=false,
-skipping namespace deletion!`
-
-### Monitoring test execution and debugging problems
-
-Run-e2e-tests prints the same output on Kubemark as on an ordinary e2e cluster, but
-if you need to dig deeper you need to learn how to debug the HollowNodes and how
-the master machine (currently) differs from an ordinary one.
-
-If you need to debug the master machine, you can do similar things as you do on an
-ordinary master. The difference between a Kubemark setup and an ordinary setup is
-that in Kubemark etcd is run as a plain docker container, and all master
-components are run as normal processes.
There's no Kubelet overseeing them. Logs
-are stored in exactly the same place, i.e. the `/var/logs/` directory. Because the
-binaries are not supervised by anything, they won't be restarted in the case of a
-crash.
-
-To help you with debugging from inside the cluster, the startup script puts a
-`~/configure-kubectl.sh` script on the master. It downloads the `gcloud` and
-`kubectl` tools and configures kubectl to work on the unsecured master port (useful
-if there are problems with security). After the script is run, you can use the
-kubectl command from the master machine to play with the cluster.
-
-Debugging HollowNodes is a bit trickier: if you experience a problem on
-one of them, you need to learn which hollow-node pod corresponds to a given
-HollowNode known by the Master. During self-registration HollowNodes provide
-their cluster IPs as Names, which means that if you need to find a HollowNode
-named `10.2.4.5` you just need to find a Pod in the external cluster with this
-cluster IP. There's a helper script
-`test/kubemark/get-real-pod-for-hollow-node.sh` that does this for you.
-
-When you have a Pod name, you can use `kubectl logs` on the external cluster to get
-logs, or use a `kubectl describe pod` call to find the external Node on which
-this particular HollowNode is running so you can ssh to it.
-
-E.g. suppose you want to see the logs of the HollowKubelet on which pod `my-pod` is running.
-To do so you can execute:
-
-```
-$ kubectl --kubeconfig=kubernetes/test/kubemark/resources/kubeconfig.kubemark describe pod my-pod
-```
-
-This outputs the pod description, which among other things contains the line:
-
-```
-Node: 1.2.3.4/1.2.3.4
-```
-
-To learn the `hollow-node` pod corresponding to node `1.2.3.4`, you use the
-aforementioned script:
-
-```
-$ kubernetes/test/kubemark/get-real-pod-for-hollow-node.sh 1.2.3.4
-```
-
-which will output the line:
-
-```
-hollow-node-1234
-```
-
-Now you just use an ordinary kubectl command to get the logs:
-
-```
-kubectl --namespace=kubemark logs hollow-node-1234
-```
-
-All those things should work exactly the same on all cloud providers.
-
-### Turning down Kubemark cluster
-
-On GCE you just need to execute the `test/kubemark/stop-kubemark.sh` script, which
-will delete the HollowNode ReplicationController and all the resources for you. On
-other providers you’ll need to delete everything yourself. As part of
-the effort mentioned above to refactor kubemark into provider-independent and
-provider-specific parts, the resource deletion logic specific to the provider
-would move out into a clean API.
-
-## Some current implementation details and future roadmap
-
-The Kubemark master uses exactly the same binaries as ordinary Kubernetes does. This
-means that it will never be out of date. On the other hand, HollowNodes use an
-existing fake for the Kubelet (called SimpleKubelet), which mocks its runtime
-manager with `pkg/kubelet/dockertools/fake_manager.go`, where most of the logic sits.
-Because there's no easy way of mocking other managers (e.g. the VolumeManager), they
-are not supported in Kubemark (e.g. we can't schedule Pods with volumes in them
-yet).
-
-We currently plan to extend kubemark along the following directions:
-- As you may have noticed in places above, we aim to make kubemark more structured
-  and easy to run across various providers without having to tweak the setup scripts,
-  using a well-defined kubemark-provider API.
-- Allow kubemark to run on various distros (GCI, debian, redhat, etc.) for any
-  given provider.
-- Make Kubemark performance on ci-tests mimic real cluster ci-tests on metrics such as
-  CPU, memory and network bandwidth usage, and realize this goal through measurable
-  objectives (like: the kubemark metric should vary no more than X% from the real
-  cluster metric). We could also use metrics reported by Prometheus.
-- Improve logging of CI-test metrics (such as aggregated API call latencies, scheduling
-  call latencies, %ile for CPU/mem usage of different master components in density/load
-  tests) by packing them into well-structured artifacts instead of (currently) dumping them
-  to logs.
-- Create a dashboard that allows easy viewing and comparison of these metrics across tests.
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-scalability/kubemark-guide.md.
+This file is a placeholder to preserve links. Please remove by April 29, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/logging.md b/contributors/devel/logging.md
index c4da6829..d857bc64 100644
--- a/contributors/devel/logging.md
+++ b/contributors/devel/logging.md
@@ -1,34 +1,3 @@
-## Logging Conventions
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-instrumentation/logging.md.
-
-The following are conventions for which klog levels to use.
-[klog](http://godoc.org/github.com/kubernetes/klog) is globally preferred to
-[log](http://golang.org/pkg/log/) for better runtime control.
-
-* klog.Errorf() - Always an error
-
-* klog.Warningf() - Something unexpected, but probably not an error
-
-* klog.Infof() has multiple levels:
-  * klog.V(0) - Generally useful for this to ALWAYS be visible to an operator
-    * Programmer errors
-    * Logging extra info about a panic
-    * CLI argument handling
-  * klog.V(1) - A reasonable default log level if you don't want verbosity.
-    * Information about config (listening on X, watching Y)
-    * Errors that repeat frequently and relate to conditions that can be corrected (pod detected as unhealthy)
-  * klog.V(2) - Useful steady state information about the service and important log messages that may correlate to significant changes in the system. This is the recommended default log level for most systems.
-    * Logging HTTP requests and their exit code
-    * System state changing (killing pod)
-    * Controller state change events (starting pods)
-    * Scheduler log messages
-  * klog.V(3) - Extended information about changes
-    * More info about system state changes
-  * klog.V(4) - Debug level verbosity
-    * Logging in particularly thorny parts of code where you may want to come back later and check it
-  * klog.V(5) - Trace level verbosity
-    * Context to understand the steps leading up to errors and warnings
-    * More information for troubleshooting reported issues
-
-As per the comments, the practical default level is V(2). Developers and QE
-environments may wish to run at V(3) or V(4). If you wish to change the log
-level, you can pass in `-v=X` where X is the desired maximum level to log.
+This file is a placeholder to preserve links. Please remove by April 28, 2019 or the release of kubernetes 1.13, whichever comes first.
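To make the levels concrete, here is a minimal sketch of how these conventions look in code (the messages and values are hypothetical; it assumes klog's standard flag registration via `InitFlags`):

```go
package main

import (
	"errors"
	"flag"

	"k8s.io/klog"
)

func main() {
	klog.InitFlags(nil) // registers -v and the other logging flags
	flag.Parse()        // run the binary with e.g. -v=2

	klog.Errorf("failed to sync pod: %v", errors.New("example")) // always an error
	klog.Warningf("watch closed unexpectedly; reconnecting")     // unexpected, but probably not an error
	klog.V(1).Infof("listening on %s", ":8080")                  // config information
	klog.V(2).Infof("killing pod %q", "default/web-1")           // steady-state system change
	klog.V(4).Infof("retrying update, attempt %d", 3)            // debug-level verbosity
	klog.Flush()
}
```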
\ No newline at end of file
diff --git a/contributors/devel/on-call-federation-build-cop.md b/contributors/devel/on-call-federation-build-cop.md
deleted file mode 100644
index c153b02a..00000000
--- a/contributors/devel/on-call-federation-build-cop.md
+++ /dev/null
@@ -1,109 +0,0 @@
-# Federation Buildcop Guide and Playbook
-
-Federation runs two classes of tests: CI and Pre-submits.
-
-## CI
-
-* These tests run on the HEADs of master and release branches (starting
-  from Kubernetes v1.7).
-* As a result, they run on code that's already merged.
-* As the name suggests, they run continuously. Currently, they are
-  configured to run at least once every 30 minutes.
-* Federation CI tests run as periodic jobs on prow.
-* CI jobs always run sequentially. In other words, no single CI job
-  can have two instances of the job running at the same time.
-* The latest build results can be viewed in [testgrid](https://k8s-testgrid.appspot.com/sig-multicluster)
-
-### Configuration
-
-Configuration steps are described in https://github.com/kubernetes/test-infra#create-a-new-job.
-Federation CI e2e job names are as below:
-* master branch - `ci-federation-e2e-gce` and `ci-federation-e2e-gce-serial`
-* 1.8 release branch - `ci-kubernetes-e2e-gce-federation-release-1-8`
-* 1.7 release branch - `ci-kubernetes-e2e-gce-federation-release-1-7`
-
-Search for the above job names in the various configuration files as below:
-
-* Prow config: https://git.k8s.io/test-infra/prow/config.yaml
-* Test job/bootstrap config: https://git.k8s.io/test-infra/jobs/config.json
-* Test grid config: https://git.k8s.io/test-infra/testgrid/config.yaml
-* Job specific config: https://git.k8s.io/test-infra/jobs/env
-
-### Results
-
-Results of all the federation CI tests are listed in the corresponding
-tabs on the Cluster Federation page in the testgrid.
-https://k8s-testgrid.appspot.com/sig-multicluster
-
-### Playbook
-
-#### Triggering a new run
-
-Please ping someone who has access to the prow project and ask
-them to click the `rerun` button from, for example,
-http://prow.k8s.io/?type=periodic&job=ci-federation-e2e-gce,
-and execute the kubectl command.
-
-#### Quota cleanup
-
-Please ping someone who has access to the GCP project. Ask them to
-look at the quotas and delete the leaked resources by clicking the
-delete button corresponding to those leaked resources on Google Cloud
-Console.
-
-
-## Pre-submit
-
-* The pre-submit test is currently configured to run on the master
-  branch and any release branch that's 1.9 or newer.
-* Multiple pre-submit jobs could be running in parallel (one per PR).
-* The latest build results can be viewed in [testgrid](https://k8s-testgrid.appspot.com/presubmits-federation)
-* We have the following pre-submit jobs in federation:
-  * bazel-test - Runs all the bazel test targets in federation.
-  * e2e-gce - Runs federation e2e tests on gce.
-  * verify - Runs federation unit and integration tests and a few verify scripts.
-
-### Configuration
-
-Configuration steps are described in https://github.com/kubernetes/test-infra#create-a-new-job.
-Federation pre-submit jobs have the following names.
-* bazel-test - `pull-federation-bazel-test`
-* verify - `pull-federation-verify`
-* e2e-gce - `pull-federation-e2e-gce`
-
-Search for the above job names in the various configuration files as below:
-
-* Prow config: https://git.k8s.io/test-infra/prow/config.yaml
-* Test job/bootstrap config: https://git.k8s.io/test-infra/jobs/config.json
-* Test grid config: https://git.k8s.io/test-infra/testgrid/config.yaml
-* Job specific config: https://git.k8s.io/test-infra/jobs/env
-
-### Results
-
-Aggregated results are available on the Gubernator dashboard page for
-the federation pre-submit tests.
-
-https://k8s-gubernator.appspot.com/builds/kubernetes-jenkins/pr-logs/directory/pull-federation-e2e-gce
-
-### Metrics
-
-We track the flakiness metrics of all the pre-submit jobs and
-individual tests that run against PRs in
-[kubernetes/federation](https://github.com/kubernetes/federation).
-
-* The metrics that we track are documented in https://git.k8s.io/test-infra/metrics/README.md#metrics.
-* Job-level metrics are available in http://storage.googleapis.com/k8s-metrics/job-flakes-latest.json.
-
-### Playbook
-
-#### Triggering a new run
-
-Use the `/test` command on the PR to re-trigger the test. The exact
-incantation is: `/test pull-federation-e2e-gce`
-
-#### Quota cleanup
-
-Please ping someone who has access to the `k8s-jkns-pr-bldr-e2e-gce-fdrtn`
-GCP project. Ask them to look at the quotas and delete the leaked
-resources by clicking the delete button corresponding to those leaked
-resources on Google Cloud Console.
diff --git a/contributors/devel/profiling.md b/contributors/devel/profiling.md
index f7c8b2e5..9951eb27 100644
--- a/contributors/devel/profiling.md
+++ b/contributors/devel/profiling.md
@@ -1,76 +1,3 @@
-# Profiling Kubernetes
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-scalability/profiling.md.
-
-This document explains how to plug in the profiler and how to profile Kubernetes services. To get familiar with the tools mentioned below, it is strongly recommended to read [Profiling Go Programs](https://blog.golang.org/profiling-go-programs).
-
-## Profiling library
-
-Go comes with the built-in 'net/http/pprof' profiling library and profiling web service. The service works by binding the debug/pprof/ subtree on a running webserver to the profiler. Reading from subpages of debug/pprof returns pprof-formatted profiles of the running binary. The output can be processed offline by the tool of choice, or used as input to the handy 'go tool pprof', which can graphically represent the result.
-
-## Adding profiling to the API server
-
-TL;DR: Add the lines:
-
-```go
-m.mux.HandleFunc("/debug/pprof/", pprof.Index)
-m.mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
-m.mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
-```
-
-to the `init(c *Config)` method in 'pkg/master/master.go' and import the 'net/http/pprof' package.
-
-In most use cases it's enough to do 'import _ net/http/pprof', which automatically registers a handler in the default http.Server. A slight inconvenience is that the API server uses the default server for intra-cluster communication, so plugging the profiler into it is not really useful. In 'pkg/kubelet/server/server.go' more servers are created and started as separate goroutines. The one that is usually serving external traffic is secureServer. The handler for this traffic is defined in 'pkg/master/master.go' and stored in the Handler variable.
It is created from an HTTP multiplexer, so the only thing that needs to be done is adding the profiler handler functions to this multiplexer. This is exactly what the lines after the TL;DR do.
-
-## Connecting to the profiler
-
-Even with the profiler running, it is not really straightforward to use 'go tool pprof' with it. The problem is that, at least for dev purposes, the certificates generated for the API server are not signed by anyone trusted, and because secureServer serves only secure traffic, it isn't straightforward to connect to the service. The best workaround I found is creating an ssh tunnel from the kubernetes_master open unsecured port to some external server, and using this server as a proxy. To save everyone from looking up the correct ssh flags, it is done by running:
-
-```sh
-ssh kubernetes_master -L<local_port>:localhost:8080
-```
-
-or an analogous one for your cloud provider. Afterwards you can e.g. run
-
-```sh
-go tool pprof http://localhost:<local_port>/debug/pprof/profile
-```
-
-to get a 30-second CPU profile.
-
-## Contention profiling
-
-To enable contention profiling you need to add the line `rt.SetBlockProfileRate(1)` in addition to the `m.mux.HandleFunc(...)` lines added before (`rt` stands for `runtime` in `master.go`). This enables the 'debug/pprof/block' subpage, which can be used as input to `go tool pprof`.
-
-## Profiling in tests
-
-To gather a profile from a test, the HTTP interface is probably not suitable. Instead, you can add the `-cpuprofile` flag to your KUBE_TEST_ARGS, e.g.
-
-```sh
-make test-integration WHAT="./test/integration/scheduler" KUBE_TEST_ARGS="-cpuprofile cpu.out"
-go tool pprof cpu.out
-```
-
-See the ['go test' flags](https://golang.org/cmd/go/#hdr-Description_of_testing_flags) for how to capture other types of profiles.
-
-## Profiling in a benchmark test
-
-Gathering a profile from a benchmark test works in the same way as for regular tests, but sometimes there may be expensive setup that you want excluded from the profile (i.e. any time you would use `b.ResetTimer()`).
-
-To solve this problem, you can explicitly start the profile in your test code like so:
-
-```go
-func BenchmarkMyFeature(b *testing.B) {
-	// Expensive test setup...
-	b.ResetTimer()
-	f, err := os.Create("bench_profile.out")
-	if err != nil {
-		log.Fatal("could not create profile file: ", err)
-	}
-	if err := pprof.StartCPUProfile(f); err != nil {
-		log.Fatal("could not start CPU profile: ", err)
-	}
-	defer pprof.StopCPUProfile()
-	// Rest of the test...
-}
-```
-
-> Note: Code added to a test to gather CPU profiles should not be merged. It is meant to be temporary while you create and analyze profiles.
+This file is a placeholder to preserve links. Please remove by April 29, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/release.md b/contributors/devel/release.md
index b4e9224e..9ce19241 100644
--- a/contributors/devel/release.md
+++ b/contributors/devel/release.md
@@ -1,307 +1,3 @@
-# Targeting Features, Issues and PRs to Release Milestones
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-release/release.md.
-
-This document is focused on Kubernetes developers and contributors
-who need to create a feature, issue, or pull request which targets a specific
-release milestone.
-
-- [TL;DR](#tldr)
-- [Definitions](#definitions)
-- [The Release Cycle](#the-release-cycle)
-- [Removal Of Items From The Milestone](#removal-of-items-from-the-milestone)
-- [Adding An Item To The Milestone](#adding-an-item-to-the-milestone)
-  - [Milestone Maintainers](#milestone-maintainers)
-  - [Feature additions](#feature-additions)
-  - [Issue additions](#issue-additions)
-  - [PR Additions](#pr-additions)
-- [Other Required Labels](#other-required-labels)
-  - [SIG Owner Label](#sig-owner-label)
-  - [Priority Label](#priority-label)
-  - [Issue Kind Label](#issue-kind-label)
-
-The process for shepherding features, issues, and pull requests
-into a Kubernetes release spans multiple stakeholders:
-* the feature, issue, or pull request owner
-* SIG leadership
-* the release team
-
-Information on workflows and interactions is described below.
-
-As the owner of a feature, issue, or pull request (PR), it is your
-responsibility to ensure release milestone requirements are met.
-Automation and the release team will be in contact with you if
-updates are required, but inaction can result in your work being
-removed from the milestone. Additional requirements exist when the
-target milestone is a prior release (see the [cherry pick
-process](cherry-picks.md) for more information).
-
-## TL;DR
-
-If you want your PR to get merged, it needs the following required labels and milestones, represented here by the Prow /commands it would take to add them:
-<table>
-<tr>
-<td></td>
-<td>Normal Dev</td>
-<td>Code Freeze</td>
-<td>Post-Release</td>
-</tr>
-<tr>
-<td></td>
-<td>Weeks 1-8</td>
-<td>Weeks 9-11</td>
-<td>Weeks 11+</td>
-</tr>
-<tr>
-<td>Required Labels</td>
-<td>
-<ul>
-<!--Weeks 1-8-->
-<li>/sig {name}</li>
-<li>/kind {type}</li>
-<li>/lgtm</li>
-<li>/approved</li>
-</ul>
-</td>
-<td>
-<ul>
-<!--Weeks 9-11-->
-<li>/milestone {v1.y}</li>
-<li>/sig {name}</li>
-<li>/kind {bug, failing-test}</li>
-<li>/priority critical-urgent</li>
-<li>/lgtm</li>
-<li>/approved</li>
-</ul>
-</td>
-<td>
-<!--Weeks 11+-->
-Return to 'Normal Dev' phase requirements:
-<ul>
-<li>/sig {name}</li>
-<li>/kind {type}</li>
-<li>/lgtm</li>
-<li>/approved</li>
-</ul>
-
-Merges into the 1.y branch are now [via cherrypicks](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md), approved by the release branch manager.
-</td>
-</tr>
-</table>
-
-In the past there was a requirement for a milestone-targeted pull
-request to have an associated GitHub issue opened, but this is no
-longer the case. Features are effectively GitHub issues or
-[KEPs](https://git.k8s.io/community/keps)
-which lead to subsequent PRs. The general labeling process should
-be consistent across artifact types.
-
----
-
-## Definitions
-
-- *issue owners*: Creator, assignees, and the user who moved the issue into a release milestone.
-- *release team*: Each Kubernetes release has a team doing project
-  management tasks described
-  [here](https://git.k8s.io/sig-release/release-team/README.md). The
-  contact info for the team associated with any given release can be
-  found [here](https://git.k8s.io/sig-release/releases/).
-- *Y days*: Refers to business days (using the location local to the release manager, M-F).
-- *feature*: see "[Is My Thing a Feature?](http://git.k8s.io/features/README.md#is-my-thing-a-feature)"
-- *release milestone*: semantic version string or [GitHub milestone](https://help.github.com/articles/associating-milestones-with-issues-and-pull-requests/) referring to a release MAJOR.MINOR vX.Y version. See also [release versioning](http://git.k8s.io/community/contributors/design-proposals/release/versioning.md).
-- *release branch*: Git branch "release-X.Y" created for the vX.Y milestone. Created at the time of the vX.Y-beta.0 release and maintained after the release for approximately 9 months with vX.Y.Z patch releases.
-
-## The Release Cycle
-
-
-
-Kubernetes releases currently happen four times per year. The release
-process can be thought of as having three main phases:
-* Feature Definition
-* Implementation
-* Stabilization
-
-But in reality this is an open source and agile project, with feature
-planning and implementation happening at all times. Given the
-project scale and globally distributed developer base, it is critical
-to project velocity not to rely on a trailing stabilization phase but
-rather to have continuous integration testing which ensures the
-project is always stable, so that individual commits can be
-flagged as having broken something.
-
-With ongoing feature definition through the year, some set of items
-will bubble up as targeting a given release. The **enhancement freeze**
-starts ~4 weeks into the release cycle. By this point all intended
-feature work for the given release has been defined in suitable
-planning artifacts in conjunction with the Release Team's [enhancements
-lead](https://git.k8s.io/sig-release/release-team/role-handbooks/enhancements/README.md).
-
-Implementation and bugfixing are ongoing across the cycle, but
-culminate in a code freeze period:
-* The **code freeze** starts in week ~10 and continues for ~2 weeks.
-  Only critical bug fixes are accepted into the release codebase.
-
-There are approximately two weeks following code freeze, and preceding
-release, during which all remaining critical issues must be resolved
-before release. This also gives time for documentation finalization.
-
-When the code base is sufficiently stable, the master branch re-opens
-for general development and work begins there for the next release
-milestone. Any remaining modifications for the current release are cherry
-picked from master back to the release branch. The release is built from
-the release branch.
-
-Following release, the [Release Branch
-Manager](https://git.k8s.io/sig-release/release-team/role-handbooks/branch-manager/README.md)
-cherry picks additional critical fixes from the master branch for
-a period of around 9 months, leaving an overlap of three release
-versions of forward support.
Thus, each release is part of a broader
-Kubernetes lifecycle:
-
-
-
-## Removal Of Items From The Milestone
-
-Before getting too far into the process for adding an item to the
-milestone, please note:
-
-Members of the Release Team may remove Issues from the milestone
-if they or the responsible SIG determine that the issue is not
-actually blocking the release and is unlikely to be resolved in a
-timely fashion.
-
-Members of the Release Team may remove PRs from the milestone for
-any of the following, or similar, reasons:
-
-* PR is potentially de-stabilizing and is not needed to resolve a blocking issue;
-* PR is a new, late feature PR and has not gone through the features process or the exception process;
-* There is no responsible SIG willing to take ownership of the PR and resolve any follow-up issues with it;
-* PR is not correctly labelled;
-* Work has visibly halted on the PR and delivery dates are uncertain or late.
-
-While members of the Release Team will help with labelling and
-contacting SIG(s), it is the responsibility of the submitter to
-categorize PRs, and to secure support from the relevant SIG to
-guarantee that any breakage caused by the PR will be rapidly resolved.
-
-Where additional action is required, an attempt at human-to-human
-escalation will be made by the release team through the following
-channels:
-
-- Comment in GitHub, mentioning the SIG team and SIG members as appropriate for the issue type
-- Emailing the SIG mailing list
-  - bootstrapped with group email addresses from the [community sig list](/sig-list.md)
-  - optionally also directly addressing SIG leadership or other SIG members
-- Messaging the SIG's Slack channel
-  - bootstrapped with the Slack channel and SIG leadership from the [community sig list](/sig-list.md)
-  - optionally directly "@" mentioning SIG leadership or others by handle
-
-## Adding An Item To The Milestone
-
-### Milestone Maintainers
-
-The members of the GitHub [“kubernetes-milestone-maintainers”
-team](https://github.com/orgs/kubernetes/teams/kubernetes-milestone-maintainers/members)
-are entrusted with the responsibility of specifying the release milestone on
-GitHub artifacts. This group is [maintained by
-SIG-Release](https://git.k8s.io/sig-release/release-team/README.md#milestone-maintainers)
-and has representation from the various SIGs' leadership.
-
-### Feature additions
-
-Feature planning and definition takes many forms today, but a typical
-example might be a large piece of work described in a
-[KEP](https://git.k8s.io/community/keps), with associated
-task issues in GitHub. When the plan has reached an implementable state and
-work is underway, the feature or parts thereof are targeted for an upcoming
-milestone by creating GitHub issues and marking them with the Prow "/milestone"
-command.
-
-For the first ~4 weeks into the release cycle, the release team's
-Enhancements Lead will interact with SIGs and feature owners via GitHub,
-Slack, and SIG meetings to capture all required planning artifacts.
-
-If you have a feature to target for an upcoming release milestone, begin a
-conversation with your SIG leadership and with that release's Enhancements
-Lead.
-
-### Issue additions
-
-Issues are marked as targeting a milestone via the Prow
-"/milestone" command.
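For example (the milestone value here is illustrative), a milestone maintainer would comment on the issue:

```
/milestone v1.14
```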
-
-The release team's [Bug Triage
-Lead](https://git.k8s.io/sig-release/release-team/role-handbooks/bug-triage/README.md) and the overall community watch
-incoming issues and triage them, as described in the contributor
-guide section on [issue triage](/contributors/guide/issue-triage.md).
-
-Marking issues with the milestone provides the community with better
-visibility regarding when an issue was observed and by when the community
-feels it must be resolved. During code freeze, a PR must have a release
-milestone set in order to merge.
-
-An open issue is no longer required for a PR, but open issues and
-associated PRs should have synchronized labels. For example, a high-priority
-bug issue might not have its associated PR merged if the PR is
-only marked as lower priority.
-
-### PR Additions
-
-PRs are marked as targeting a milestone via the Prow
-"/milestone" command.
-
-This is a blocking requirement during code freeze, as described above.
-
-## Other Required Labels
-
-*Note*: [Here is the list of labels and their use and purpose.](https://git.k8s.io/test-infra/label_sync/labels.md#labels-that-apply-to-all-repos-for-both-issues-and-prs)
-
-### SIG Owner Label
-
-The SIG owner label defines the SIG to which we escalate if a
-milestone issue is languishing or needs additional attention. If
-there are no updates after escalation, the issue may be automatically
-removed from the milestone.
-
-These are added with the Prow "/sig" command. For example, to add
-the label indicating SIG Storage is responsible, comment with `/sig
-storage`.
-
-### Priority Label
-
-Priority labels are used to determine an escalation path before
-moving issues out of the release milestone. They are also used to
-determine whether or not a release should be blocked on the resolution
-of the issue.
-
-- `priority/critical-urgent`: Never automatically move out of a release milestone; continually escalate to the contributor and SIG through all available channels.
-  - considered a release blocking issue
-  - code freeze: issue owner update frequency: daily
-  - would require a patch release if left undiscovered until after the minor release.
-- `priority/important-soon`: Escalate to the issue owners and SIG owner; move out of the milestone after several unsuccessful escalation attempts.
-  - not considered a release blocking issue
-  - would not require a patch release
-  - will automatically be moved out of the release milestone at code freeze after a 4 day grace period
-- `priority/important-longterm`: Escalate to the issue owners; move out of the milestone after 1 attempt.
-  - even less urgent / critical than `priority/important-soon`
-  - moved out of the milestone more aggressively than `priority/important-soon`
-
-### Issue/PR Kind Label
-
-The issue kind is used to help identify the types of changes going
-into the release over time. This may allow the release team to
-develop a better understanding of what sorts of issues we would
-miss with a faster release cadence.
-
-For release-targeted issues, including pull requests, one of the following
-issue kind labels must be set:
-
-- `kind/api-change`: Adds, removes, or changes an API
-- `kind/bug`: Fixes a newly discovered bug.
-- `kind/cleanup`: Adding tests, refactoring, fixing old bugs.
-- `kind/design`: Related to design
-- `kind/documentation`: Adds documentation
-- `kind/failing-test`: CI test case is failing consistently.
-- `kind/feature`: New functionality.
-- `kind/flake`: CI test case is showing intermittent failures.
+This file is a placeholder to preserve links.
Please remove by April 29, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/scheduler.md b/contributors/devel/scheduler.md
index 486b04a9..6f2ae192 100644
--- a/contributors/devel/scheduler.md
+++ b/contributors/devel/scheduler.md
@@ -1,90 +1,3 @@
-# The Kubernetes Scheduler
-
-The Kubernetes scheduler runs as a process alongside the other master components such as the API server.
-Its interface to the API server is to watch for Pods with an empty PodSpec.NodeName,
-and for each such Pod, it posts a binding indicating where the Pod should be scheduled.
-
-## Exploring the code
-
-At a high level, the scheduler is divided into three layers:
-- [cmd/kube-scheduler/scheduler.go](http://releases.k8s.io/HEAD/cmd/kube-scheduler/scheduler.go):
-  This is the main() entry that does initialization before calling the scheduler framework.
-- [pkg/scheduler/scheduler.go](http://releases.k8s.io/HEAD/pkg/scheduler/scheduler.go):
-  This is the scheduler framework that handles everything beyond the scheduling algorithm (e.g. binding).
-- [pkg/scheduler/core/generic_scheduler.go](http://releases.k8s.io/HEAD/pkg/scheduler/core/generic_scheduler.go):
-  The scheduling algorithm that assigns nodes to pods.
-
-## The scheduling algorithm
-
-```
-For given pod:
-
-    +---------------------------------------------+
-    |             Schedulable nodes:              |
-    |                                             |
-    | +--------+    +--------+      +--------+    |
-    | | node 1 |    | node 2 |      | node 3 |    |
-    | +--------+    +--------+      +--------+    |
-    |                                             |
-    +-------------------+-------------------------+
-                        |
-                        |
-                        v
-    +-------------------+-------------------------+
-
-    Pred. filters: node 3 doesn't have enough resource
-
-    +-------------------+-------------------------+
-                        |
-                        |
-                        v
-    +-------------------+-------------------------+
-    |             remaining nodes:                |
-    |     +--------+          +--------+          |
-    |     | node 1 |          | node 2 |          |
-    |     +--------+          +--------+          |
-    |                                             |
-    +-------------------+-------------------------+
-                        |
-                        |
-                        v
-    +-------------------+-------------------------+
-
-    Priority function:    node 1: p=2
-                          node 2: p=5
-
-    +-------------------+-------------------------+
-                        |
-                        |
-                        v
-            select max{node priority} = node 2
-```
-
-The Scheduler tries to find a node for each Pod, one at a time.
-- First it applies a set of "predicates" to filter out inappropriate nodes. For example, if the PodSpec specifies resource requests, then the scheduler will filter out nodes that don't have at least the requested amount of resources available (computed as the capacity of the node minus the sum of the resource requests of the containers that are already running on the node).
-- Second, it applies a set of "priority functions"
-that rank the nodes that weren't filtered out by the predicate check. For example, it tries to spread Pods across nodes and zones while at the same time favoring the least (theoretically) loaded nodes (where "load" - in theory - is measured as the sum of the resource requests of the containers running on the node, divided by the node's capacity).
-- Finally, the node with the highest priority is chosen (or, if there are multiple such nodes, one of them is chosen at random). The code for this main scheduling loop is in the function `Schedule()` in [pkg/scheduler/core/generic_scheduler.go](http://releases.k8s.io/HEAD/pkg/scheduler/core/generic_scheduler.go).
-
-### Predicates and priorities policies
-
-Predicates are a set of policies applied one by one to filter out inappropriate nodes.
-Priorities are a set of policies applied one by one to rank nodes (that made it through the filter of the predicates).
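To make the two phases concrete, here is a minimal sketch of the filter-then-rank flow (hypothetical helper names and a toy CPU-only resource model, not the actual scheduler code):

```go
package main

import "fmt"

// node is a toy stand-in for the scheduler's view of a node: a name and the
// CPU (millicores) still free after the requests of already-running Pods.
type node struct {
	name    string
	freeCPU int
}

// fitsResources plays the role of a predicate such as PodFitsResources.
func fitsResources(requestCPU int, n node) bool {
	return n.freeCPU >= requestCPU
}

// leastRequested plays the role of a priority function such as
// LeastRequestedPriority: the more capacity left after placement, the higher the score.
func leastRequested(requestCPU int, n node) int {
	return n.freeCPU - requestCPU
}

func main() {
	nodes := []node{{"node1", 200}, {"node2", 500}, {"node3", 50}}
	requestCPU := 100 // the Pod's resource request

	// Phase 1: filter out nodes that fail a predicate.
	var feasible []node
	for _, n := range nodes {
		if fitsResources(requestCPU, n) {
			feasible = append(feasible, n)
		}
	}
	if len(feasible) == 0 {
		fmt.Println("pod is unschedulable")
		return
	}

	// Phase 2: rank the remaining nodes and pick the highest score.
	best := feasible[0]
	for _, n := range feasible[1:] {
		if leastRequested(requestCPU, n) > leastRequested(requestCPU, best) {
			best = n
		}
	}
	fmt.Println("schedule on", best.name) // node3 is filtered out; node2 wins
}
```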
-By default, Kubernetes provides built-in predicates and priorities policies documented in [scheduler_algorithm.md](scheduler_algorithm.md).
-The predicates and priorities code is defined in [pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/predicates/predicates.go) and [pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/priorities/), respectively.
-
-
-## Scheduler extensibility
-
-The scheduler is extensible: the cluster administrator can choose which of the pre-defined
-scheduling policies to apply, and can add new ones.
-
-### Modifying policies
-
-The policies that are applied when scheduling can be chosen in one of two ways.
-The default policies used are selected by the functions `defaultPredicates()` and `defaultPriorities()` in
-[pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithmprovider/defaults/defaults.go).
-However, the choice of policies can be overridden by passing the command-line flag `--policy-config-file` to the scheduler, pointing to a JSON file specifying which scheduling policies to use. See [examples/scheduler-policy-config.json](https://git.k8s.io/examples/staging/scheduler-policy-config.json) for an example
-config file. (Note that the config file format is versioned; the API is defined in [pkg/scheduler/api](http://releases.k8s.io/HEAD/pkg/scheduler/api/).)
-Thus, to add a new scheduling policy, you should modify [pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/predicates/predicates.go) or add to the directory [pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/priorities/), and either register the policy in `defaultPredicates()` or `defaultPriorities()`, or use a policy config file.
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-scheduling/scheduler.md.
+This file is a placeholder to preserve links. Please remove by April 29, 2019 or the release of kubernetes 1.13, whichever comes first.
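For a sense of the shape of the policy config file mentioned above, here is a sketch using predicate and priority names described in these scheduler documents (an illustrative subset only; consult the linked example file for an authoritative version):

```json
{
  "kind": "Policy",
  "apiVersion": "v1",
  "predicates": [
    {"name": "PodFitsResources"},
    {"name": "PodFitsHostPorts"},
    {"name": "MatchNodeSelector"}
  ],
  "priorities": [
    {"name": "LeastRequestedPriority", "weight": 1},
    {"name": "BalancedResourceAllocation", "weight": 1},
    {"name": "SelectorSpreadPriority", "weight": 2}
  ]
}
```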
\ No newline at end of file
diff --git a/contributors/devel/scheduler_algorithm.md b/contributors/devel/scheduler_algorithm.md
index e6596b47..07af7e49 100644
--- a/contributors/devel/scheduler_algorithm.md
+++ b/contributors/devel/scheduler_algorithm.md
@@ -1,40 +1,3 @@
-# Scheduler Algorithm in Kubernetes
-
-For each unscheduled Pod, the Kubernetes scheduler tries to find a node across the cluster according to a set of rules. A general introduction to the Kubernetes scheduler can be found at [scheduler.md](scheduler.md). In this document, the algorithm for selecting a node for the Pod is explained. There are two steps before a destination node of a Pod is chosen. The first step is filtering all the nodes and the second is ranking the remaining nodes to find the best fit for the Pod.
-
-## Filtering the nodes
-
-The purpose of filtering the nodes is to filter out the nodes that do not meet certain requirements of the Pod. For example, if the free resource on a node (measured by the capacity minus the sum of the resource requests of all the Pods that already run on the node) is less than the Pod's required resource, the node should not be considered in the ranking phase, so it is filtered out. Currently, there are several "predicates" implementing different filtering policies, including:
-
-- `NoDiskConflict`: Evaluate if a pod can fit due to the volumes it requests, and those that are already mounted. Currently supported volumes are: AWS EBS, GCE PD, iSCSI and Ceph RBD. Only Persistent Volume Claims for those supported types are checked. Persistent Volumes added directly to pods are not evaluated and are not constrained by this policy.
-- `NoVolumeZoneConflict`: Evaluate if the volumes a pod requests are available on the node, given the Zone restrictions.
-- `PodFitsResources`: Check if the free resources (CPU and Memory) meet the requirements of the Pod. The free resources are measured by the capacity minus the sum of requests of all Pods on the node. To learn more about the resource QoS in Kubernetes, please check the [QoS proposal](../design-proposals/node/resource-qos.md).
-- `PodFitsHostPorts`: Check if any HostPort required by the Pod is already occupied on the node.
-- `HostName`: Filter out all nodes except the one specified in the PodSpec's NodeName field.
-- `MatchNodeSelector`: Check if the labels of the node match the labels specified in the Pod's `nodeSelector` field and, as of Kubernetes v1.2, also match the `nodeAffinity` if present. See [here](https://kubernetes.io/docs/user-guide/node-selection/) for more details on both.
-- `MaxEBSVolumeCount`: Ensure that the number of attached ElasticBlockStore volumes does not exceed a maximum value (by default, 39, since Amazon recommends a maximum of 40 with one of those 40 reserved for the root volume -- see [Amazon's documentation](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html#linux-specific-volume-limits)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable.
-- `MaxGCEPDVolumeCount`: Ensure that the number of attached GCE PersistentDisk volumes does not exceed a maximum value (by default, 16, which is the maximum GCE allows -- see [GCE's documentation](https://cloud.google.com/compute/docs/disks/persistent-disks#limits_for_predefined_machine_types)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable.
-- `CheckNodeMemoryPressure`: Check if a pod can be scheduled on a node reporting a memory pressure condition.
Currently, no ``BestEffort`` pods should be placed on a node under memory pressure, as they get automatically evicted by the kubelet.
-- `CheckNodeDiskPressure`: Check if a pod can be scheduled on a node reporting a disk pressure condition. Currently, no pods should be placed on a node under disk pressure, as they get automatically evicted by the kubelet.
-
-The details of the above predicates can be found in [pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/predicates/predicates.go). All predicates mentioned above can be used in combination to perform a sophisticated filtering policy. Kubernetes uses some, but not all, of these predicates by default. You can see which ones are used by default in [pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithmprovider/defaults/defaults.go).
-
-## Ranking the nodes
-
-The filtered nodes are considered suitable to host the Pod, and often more than one node remains. Kubernetes prioritizes the remaining nodes to find the "best" one for the Pod. The prioritization is performed by a set of priority functions. For each remaining node, a priority function gives a score which scales from 0-10, with 10 representing "most preferred" and 0 "least preferred". Each priority function is weighted by a positive number and the final score of each node is calculated by adding up all the weighted scores. For example, suppose there are two priority functions, `priorityFunc1` and `priorityFunc2` with weighting factors `weight1` and `weight2` respectively; the final score of some NodeA is:
-
-    finalScoreNodeA = (weight1 * priorityFunc1) + (weight2 * priorityFunc2)
-
-For instance, if `weight1 = 1`, `weight2 = 2`, and NodeA scores 8 on `priorityFunc1` and 5 on `priorityFunc2`, its final score would be 1*8 + 2*5 = 18.
-
-After the scores of all nodes are calculated, the node with the highest score is chosen as the host of the Pod. If more than one node has the equal highest score, one of them is chosen at random.
-
-Currently, the Kubernetes scheduler provides some practical priority functions, including:
-
-- `LeastRequestedPriority`: The node is prioritized based on the fraction of the node that would be free if the new Pod were scheduled onto the node. (In other words, (capacity - sum of requests of all Pods already on the node - request of Pod that is being scheduled) / capacity). CPU and memory are equally weighted. The node with the highest free fraction is the most preferred. Note that this priority function has the effect of spreading Pods across the nodes with respect to resource consumption.
-- `BalancedResourceAllocation`: This priority function tries to put the Pod on a node such that the CPU and Memory utilization rate is balanced after the Pod is deployed.
-- `SelectorSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service, replication controller, or replica set on the same node. If zone information is present on the nodes, the priority will be adjusted so that pods are spread across zones and nodes.
-- `CalculateAntiAffinityPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on nodes with the same value for a particular label.
-- `ImageLocalityPriority`: Nodes are prioritized based on the locality of the images requested by a pod. Nodes that already have a larger total size of the packages required by the pod installed are preferred over nodes on which none, or only a small total size, of the required packages is installed.
-- `NodeAffinityPriority`: (Kubernetes v1.2) Implements `preferredDuringSchedulingIgnoredDuringExecution` node affinity; see [here](https://kubernetes.io/docs/user-guide/node-selection/) for more details.
-
-The details of the above priority functions can be found in [pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/priorities/). Kubernetes uses some, but not all, of these priority functions by default. You can see which ones are used by default in [pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithmprovider/defaults/defaults.go). As with predicates, you can combine the above priority functions and assign weight factors (positive numbers) to them as you want (check [scheduler.md](scheduler.md) for how to customize).
+This file has moved to https://git.k8s.io/community/contributors/devel/sig-scheduling/scheduler_algorithm.md.
+This file is a placeholder to preserve links. Please remove by April 29, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/devel/sig-instrumentation/event-style-guide.md b/contributors/devel/sig-instrumentation/event-style-guide.md
new file mode 100644
index 00000000..bc4ba22b
--- /dev/null
+++ b/contributors/devel/sig-instrumentation/event-style-guide.md
@@ -0,0 +1,51 @@
+# Event style guide
+
+Status: During Review
+
+Author: Marek Grabowski (gmarek@)
+
+## Why the guide?
+
+The Event API change proposal is the first step towards having useful Events in the system. Another step is to formalize the Event style guide, i.e. a set of properties that developers need to ensure when adding new Events to the system. This is necessary to ensure that we have a system in which all components emit consistently structured Events.
+
+## When to emit an Event?
+
+Events are expected to provide important insights for the application developer/operator on the state of their application. Events relevant to cluster administrators are acceptable as well, though they usually also have the option of looking at component logs. Events are much more expensive than logs, thus they're not expected to provide in-depth system debugging information. Instead, concentrate on things that are important from the application developer's perspective. Events need to be either actionable, or be useful to understand past or future system behavior. Events are not intended to drive automation. Watching resource status should be sufficient for controllers.
+
+The following are guidelines for adding Events to the system. They are not hard-and-fast rules, but should be considered by all contributors adding new Events and members doing reviews.
+1. Emit events only when the state of the system changes or an attempt is made to change it. Events like "it's still running" are not interesting. Also, changes that do not add information beyond what is observable by watching the altered resources should not be duplicated as events. Note that adding a reason for some action that can't be inferred from the state change is considered additional information.
+1. Limit Events to no more than one per change/attempt. There's no need for Events on "About to do X" AND "Did X"/"Failed to do X". The result is more interesting and implies an attempt.
+    1. It may give the impression that this gets tricky with scale events, e.g. a Deployment scales a ReplicaSet which creates/deletes Pods. For us those are 3 (or more) separate Events (3 different objects are affected), so it's fine to emit multiple Events.
+1. When an error occurs that prevents a user application from starting or from enacting other normal system behavior, such as object creation, an Event should be emitted (e.g. invalid image).
+    1. Note that Events are garbage collected, so every user-actionable error needs to be surfaced via resource status as well.
+    1. It's usually OK to emit failure Events for each failure. The dedup mechanism will deal with that. The exception is failures that are frequent but typically ephemeral and automatically repairable/recoverable, such as broken socket connections, in which case they should only be reported if persistent and unrepairable, in order to mitigate event spam.
+1. When a user application stops running for any reason, an Event should be emitted (e.g. Pod evicted because the Node is under memory pressure).
+1. If it's a system-wide change of state that may impact currently running applications or may have a severe impact on future workload schedulability, an Event should be emitted (e.g. Node became unreachable, failed to create route for Node).
+1. If it doesn't fit any of the above scenarios, you should consider not emitting an Event.
+
+## How to structure an Event?
+The new Event API tries to use more descriptive field names to influence how Events are structured. An Event has the following fields:
+* Regarding
+* Related
+* ReportingController
+* ReportingInstance
+* Action
+* Reason
+* Type
+* Note
+
+The Event should be structured in a way that the following sentence "makes sense":
+"Regarding <Event.Regarding>: <Event.Action> <Event.Related> - <Event.Reason>", e.g.
+* Regarding Node X: BecameNotReady - NodeUnreachable
+* Regarding Pod X: ScheduledOnNode Node Y - <nil>
+* Regarding PVC X: BoundToNode Node Y - <nil>
+* Regarding Pod X: KilledContainer Container Y - NodeMemoryPressure
+
+1. ReportingController is the type of the Controller reporting an Event, e.g. k8s.io/node-controller, k8s.io/kubelet. There will be a standard list of controller names for Kubernetes components. Third-party components must namespace themselves in the same manner as label keys. Validation ensures it's a proper qualified name. This shouldn’t be needed in order for users to understand the event, but is provided in case the controller’s logs need to be accessed for further debugging.
+1. ReportingInstance is an identifier of the instance of the ReportingController which needs to uniquely identify it. I.e. the host name can be used only for controllers that are guaranteed to be unique on the host. This requirement isn't met e.g. for the scheduler, so it may need a secondary index. For singleton controllers, use the Node name (or the hostname if the controller is not running on a Node). Can have at most 128 alphanumeric characters.
+1. Regarding and Related are ObjectReferences. Regarding should represent the object that's implemented by the ReportingController; Related can contain additional information about another object that takes part in or is affected by the Action (see examples).
+1. Action is a low-cardinality (meaning that there's a restricted, predefined set of values allowed) CamelCase string field (i.e. its value has to be determined at compile time) that explains what happened with Regarding/what action the ReportingController took in Regarding's name. The tuple of {ReportingController, Action, Reason} must be unique, such that a user could look up documentation. Can have at most 128 characters.
+1. Reason is a low-cardinality CamelCase string field (i.e. its value has to be determined at compile time) that explains why the ReportingController took the Action. Can have at most 128 characters.
+1. Type can be either "Normal" or "Warning". "Warning" types are reserved for Events that represent a situation that's not expected in a healthy cluster and/or healthy workload: something unexpected and/or undesirable, at least if it occurs frequently enough and/or for a long enough duration.
+1. Note can contain an arbitrary, high-cardinality, user-readable summary of the Event. This field can lose data if deduplication is triggered. Can have at most 1024 characters.
+
diff --git a/contributors/devel/sig-instrumentation/instrumentation.md b/contributors/devel/sig-instrumentation/instrumentation.md
new file mode 100644
index 00000000..b0a11193
--- /dev/null
+++ b/contributors/devel/sig-instrumentation/instrumentation.md
@@ -0,0 +1,215 @@
+## Instrumenting Kubernetes
+
+The following references and outlines general guidelines for metric instrumentation
+in Kubernetes components. Components are instrumented using the
+[Prometheus Go client library](https://github.com/prometheus/client_golang). For non-Go
For non-Go components, [libraries in other languages](https://prometheus.io/docs/instrumenting/clientlibs/) are available.
+
+The metrics are exposed via HTTP in the [Prometheus metric format](https://prometheus.io/docs/instrumenting/exposition_formats/), which is open and well understood by a wide range of third-party applications and vendors outside of the Prometheus ecosystem.
+
+The [general instrumentation advice](https://prometheus.io/docs/practices/instrumentation/) from the Prometheus documentation applies. This document reiterates common pitfalls and some Kubernetes-specific considerations.
+
+Prometheus metrics are cheap as they have minimal internal memory state. Set and increment operations are thread-safe and take 10-25 nanoseconds (Go & Java). Thus, instrumentation can and should cover all operationally relevant aspects of an application, internal and external.
+
+## Quick Start
+
+The following describes the basic steps required to add a new metric (in Go).
+
+1. Import "github.com/prometheus/client_golang/prometheus".
+
+2. Create a top-level var to define the metric. For this, you have to:
+
+    1. Pick the type of metric. Use a Gauge for things you want to set to a particular value, a Counter for things you want to increment, or a Histogram or Summary for histograms/distributions of values (typically for latency). Histograms are better if you're going to aggregate the values across jobs, while summaries are better if you just want the job to give you a useful summary of the values.
+    2. Give the metric a name and description.
+    3. Pick whether you want to distinguish different categories of things using labels on the metric. If so, add "Vec" to the name of the type of metric you want and add a slice of the label names to the definition.
+
+   [Example](https://github.com/kubernetes/kubernetes/blob/cd3299307d44665564e1a5c77d0daa0286603ff5/pkg/apiserver/apiserver.go#L53)
+   ```go
+   requestCounter = prometheus.NewCounterVec(
+       prometheus.CounterOpts{
+           Name: "apiserver_request_count",
+           Help: "Counter of apiserver requests broken out for each verb, API resource, client, and HTTP response code.",
+       },
+       []string{"verb", "resource", "client", "code"},
+   )
+   ```
+
+3. Register the metric so that Prometheus will know to export it.
+
+   [Example](https://github.com/kubernetes/kubernetes/blob/cd3299307d44665564e1a5c77d0daa0286603ff5/pkg/apiserver/apiserver.go#L78)
+   ```go
+   func init() {
+       prometheus.MustRegister(requestCounter)
+       prometheus.MustRegister(requestLatencies)
+       prometheus.MustRegister(requestLatenciesSummary)
+   }
+   ```
+
+4. Use the metric by calling the appropriate method for your metric type (Set, Inc/Add, or Observe, respectively for Gauge, Counter, or Histogram/Summary), first calling WithLabelValues if your metric has any labels.
+
+   [Example](https://github.com/kubernetes/kubernetes/blob/cd3299307d44665564e1a5c77d0daa0286603ff5/pkg/apiserver/apiserver.go#L87)
+   ```go
+   requestCounter.WithLabelValues(*verb, *resource, client, strconv.Itoa(*httpCode)).Inc()
+   ```
+
+
+## Instrumentation types
+
+Components have metrics capturing events and states that are inherent to their application logic. Examples are request and error counters, request latency histograms, or internal garbage collection cycles. Those metrics are instrumented directly in the application code.
+
+Secondly, there are business logic metrics. Those are not about observed application behavior but abstract system state, such as desired replicas for a deployment. They are not directly instrumented but collected from otherwise exposed data.
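To make the distinction concrete, below is a minimal sketch of such a business-logic metric in Go. The metric name and the `desiredReplicas` accessor are hypothetical stand-ins, not an existing Kubernetes API; a real exporter would derive one value per object read from the API server.

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// desiredReplicas is a hypothetical accessor standing in for state read from
// the API server; a real exporter would list Deployments and derive this per
// object instead of returning a constant.
func desiredReplicas() float64 { return 3 }

// The gauge function is evaluated on every scrape, so the abstract state is
// exposed without the component pushing updates itself.
var deploymentReplicas = prometheus.NewGaugeFunc(
	prometheus.GaugeOpts{
		Name: "example_deployment_spec_replicas",
		Help: "Desired replicas for a deployment (illustrative only).",
	},
	desiredReplicas,
)

func main() {
	prometheus.MustRegister(deploymentReplicas)
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```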
+
+In Kubernetes they are generally captured in the [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) component, which reads them from the API server. For this type of metric exposition, the [exporter guidelines](https://prometheus.io/docs/instrumenting/writing_exporters/) apply additionally.
+
+## Naming
+
+Metrics added directly by application or package code should have a unique name. This avoids collisions of metrics added via dependencies. Unique names also clearly distinguish metrics collected with different semantics. This is solved through prefixes:
+
+```
+<component_name>_<metric>
+```
+
+For example, suppose the kubelet instrumented its HTTP requests but also uses an HTTP router providing its own implementation. Both expose metrics on total HTTP requests. They should be distinguishable as in:
+
+```
+kubelet_http_requests_total{path=”/some/path”,status=”200”}
+routerpkg_http_requests_total{path=”/some/path”,status=”200”,method=”GET”}
+```
+
+As we can see, they expose different labels, and thus a naming collision would not have been possible to resolve even if both metrics counted the exact same requests.
+
+Resource objects that occur in names should inherit the spelling that is used in kubectl, i.e. daemon sets are `daemonset` rather than `daemon_set`.
+
+## Dimensionality & Cardinality
+
+Metrics can often replace more expensive logging as they are time-aggregated over a sampling interval. The [multidimensional data model](https://prometheus.io/docs/concepts/data_model/) enables deep insights, and all metrics should use those label dimensions where appropriate.
+
+A common error that often causes performance issues in the ingesting metric system is using dimensions that inhibit or eliminate time aggregation by being too specific. Typically those are user IDs or error messages. More generally: one should know a comprehensive list of all possible values for a label at instrumentation time.
+
+Notable exceptions are exporters like kube-state-metrics, which expose per-pod or per-deployment metrics, which are theoretically unbound over time as one could constantly create new ones, with new names. However, they have a reasonable upper bound for a given size of infrastructure they refer to and its typical frequency of changes.
+
+In general, “external” labels like pod or node name do not belong in the instrumentation itself. They are to be attached to metrics by the collecting system that has the external knowledge ([blog post](https://www.robustperception.io/target-labels-are-for-life-not-just-for-christmas/)).
+
+## Normalization
+
+Metrics should be normalized with respect to their dimensions. They should expose the minimal set of labels, each of which provides additional information. Labels that are composed from values of different labels are not desirable. For example:
+
+```
+example_metric{pod=”abc”,container=”proxy”,container_long=”abc/proxy”}
+```
+
+It often seems feasible to add additional meta information about an object to all metrics about that object, e.g.:
+
+```
+kube_pod_container_restarts{namespace=...,pod=...,container=...}
+```
+
+A common use case is wanting to look at such metrics with respect to the node the pod is scheduled on. So it seems convenient to add a “node” label.
+
+```
+kube_pod_container_restarts{namespace=...,pod=...,container=...,node=...}
+```
+
+This however only caters to one specific query use case. There are many more pieces of metadata that could be added, effectively blowing up the instrumentation. They are also not guaranteed to be stable over time. What if pods at some point can be live migrated? Those pieces of information should be normalized into an info-level metric ([blog post](https://www.robustperception.io/exposing-the-software-version-to-prometheus/)), which is always set to 1. For example:
+
+```
+kube_pod_info{pod=...,namespace=...,pod_ip=...,host_ip=..,node=..., ...}
+```
+
+The metric system can later denormalize those along the identifying “pod” and “namespace” labels.
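As a sketch of what such denormalization could look like in PromQL (the metric names mirror the examples above; the exact query is illustrative):

```
kube_pod_container_restarts * on(namespace, pod) group_left(node) kube_pod_info
```

Because the info metric always has the value 1, the multiplication leaves the restart counts unchanged, while `group_left` copies the `node` label from the info metric onto the result.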
This leads to...
+
+## Resource Referencing
+
+It is often desirable to correlate different metrics about a common object, such as a pod. Label dimensions can be used to match up different metrics. This is easiest if label names and values follow a common pattern. For metrics exposed by the same application, that often happens naturally.
+
+For a system composed of several independent and pluggable components, it makes sense to set cross-component standards to allow easy querying in metric systems without extensive post-processing of data. In Kubernetes, those are the resource objects such as deployments, pods, or services and the namespace they belong to.
+
+The following should be consistently used:
+
+```
+example_metric_ccc{pod=”example-app-5378923”, namespace=”default”}
+```
+
+An object is referenced by its unique name in a label named after the resource itself (i.e. `pod`/`deployment`/... and not `pod_name`/`deployment_name`) and the namespace it belongs to in the `namespace` label.
+
+Note: namespace/name combinations are only unique at a certain point in time. For time series this is given by the timestamp associated with any data point. UUIDs are truly unique but not convenient to use in user-facing time series queries. They can still be incorporated using an info-level metric as described above for `kube_pod_info`. A query to a metric system selecting by UUID via the info-level metric could look as follows:
+
+```
+kube_pod_restarts and on(namespace, pod) kube_pod_info{uuid=”ABC”}
+```
+
diff --git a/contributors/devel/sig-instrumentation/logging.md b/contributors/devel/sig-instrumentation/logging.md
new file mode 100644
index 00000000..c4da6829
--- /dev/null
+++ b/contributors/devel/sig-instrumentation/logging.md
@@ -0,0 +1,34 @@
+## Logging Conventions
+
+The following are the conventions for which klog levels to use. [klog](http://godoc.org/github.com/kubernetes/klog) is globally preferred to [log](http://golang.org/pkg/log/) for better runtime control.
+
+* klog.Errorf() - Always an error
+
+* klog.Warningf() - Something unexpected, but probably not an error
+
+* klog.Infof() has multiple levels:
+  * klog.V(0) - Generally useful for this to ALWAYS be visible to an operator
+    * Programmer errors
+    * Logging extra info about a panic
+    * CLI argument handling
+  * klog.V(1) - A reasonable default log level if you don't want verbosity.
+    * Information about config (listening on X, watching Y)
+    * Errors that repeat frequently and relate to conditions that can be corrected (pod detected as unhealthy)
+  * klog.V(2) - Useful steady state information about the service and important log messages that may correlate to significant changes in the system. This is the recommended default log level for most systems.
+ * Logging HTTP requests and their exit code + * System state changing (killing pod) + * Controller state change events (starting pods) + * Scheduler log messages + * klog.V(3) - Extended information about changes + * More info about system state changes + * klog.V(4) - Debug level verbosity + * Logging in particularly thorny parts of code where you may want to come back later and check it + * klog.V(5) - Trace level verbosity + * Context to understand the steps leading up to errors and warnings + * More information for troubleshooting reported issues + +As per the comments, the practical default level is V(2). Developers and QE +environments may wish to run at V(3) or V(4). If you wish to change the log +level, you can pass in `-v=X` where X is the desired maximum level to log. diff --git a/contributors/devel/sig-release/cherry-picks.md b/contributors/devel/sig-release/cherry-picks.md new file mode 100644 index 00000000..7769f970 --- /dev/null +++ b/contributors/devel/sig-release/cherry-picks.md @@ -0,0 +1,73 @@ +# Overview + +This document explains how cherry-picks are managed on release branches within +the kubernetes/kubernetes repository. +A common use case for this task is backporting PRs from master to release +branches. + +## Prerequisites + * [Contributor License Agreement](http://git.k8s.io/community/CLA.md) is + considered implicit for all code within cherry-pick pull requests, + **unless there is a large conflict**. + * A pull request merged against the master branch. + * [Release branch](https://git.k8s.io/release/docs/branching.md) exists. + * The normal git and GitHub configured shell environment for pushing to your + kubernetes `origin` fork on GitHub and making a pull request against a + configured remote `upstream` that tracks + "https://github.com/kubernetes/kubernetes.git", including `GITHUB_USER`. + * Have `hub` installed, which is most easily installed via `go get + github.com/github/hub` assuming you have a standard golang development + environment. + +## Initiate a Cherry-pick + * Run the [cherry-pick + script](https://git.k8s.io/kubernetes/hack/cherry_pick_pull.sh). + This example applies a master branch PR #98765 to the remote branch + `upstream/release-3.14`: `hack/cherry_pick_pull.sh upstream/release-3.14 + 98765` + * Be aware the cherry-pick script assumes you have a git remote called + `upstream` that points at the Kubernetes github org. + Please see our [recommended Git workflow](https://git.k8s.io/community/contributors/guide/github-workflow.md#workflow). + * You will need to run the cherry-pick script separately for each patch release you want to cherry-pick to. + + * Your cherry-pick PR will immediately get the `do-not-merge/cherry-pick-not-approved` label. + The [Branch Manager](https://git.k8s.io/sig-release/release-team/role-handbooks/branch-manager) + will triage PRs targeted to the next .0 minor release branch up until the + release, while the [Patch Release Team](https://git.k8s.io/sig-release/release-team/role-handbooks/patch-release-manager) + will handle all cherry-picks to patch releases. + Normal rules apply for code merge. + * Reviewers `/lgtm` and owners `/approve` as they deem appropriate. + * Milestones on cherry-pick PRs should be the milestone for the target + release branch (for example, milestone 1.11 for a cherry-pick onto + release-1.11). + * You can find the current release team members in the + [appropriate release folder](https://git.k8s.io/sig-release/releases) for the target release. 
+ You may cc them with `<@githubusername>` on your cherry-pick PR. + +## Cherry-pick Review + +Cherry-pick pull requests have an additional requirement compared to normal pull +requests. +They must be approved specifically for cherry-pick by Approvers. +The [Branch Manager](https://git.k8s.io/sig-release/release-team/role-handbooks/branch-manager) +or the [Patch Release Team](https://git.k8s.io/sig-release/release-team/role-handbooks/patch-release-manager) +are the final authority on removing the `do-not-merge/cherry-pick-not-approved` +label and triggering a merge into the target branch. + +## Searching for Cherry-picks + +- [A sample search on kubernetes/kubernetes pull requests that are labeled as `cherry-pick-approved`](https://github.com/kubernetes/kubernetes/pulls?q=is%3Aopen+is%3Apr+label%3Acherry-pick-approved) + +- [A sample search on kubernetes/kubernetes pull requests that are labeled as `do-not-merge/cherry-pick-not-approved`](https://github.com/kubernetes/kubernetes/pulls?q=is%3Aopen+is%3Apr+label%3Ado-not-merge%2Fcherry-pick-not-approved) + + +## Troubleshooting Cherry-picks + +Contributors may encounter some of the following difficulties when initiating a cherry-pick. + +- A cherry-pick PR does not apply cleanly against an old release branch. +In that case, you will need to manually fix conflicts. + +- The cherry-pick PR includes code that does not pass CI tests. +In such a case you will have to fetch the auto-generated branch from your fork, amend the problematic commit and force push to the auto-generated branch. +Alternatively, you can create a new PR, which is noisier. diff --git a/contributors/devel/sig-release/getting-builds.md b/contributors/devel/sig-release/getting-builds.md new file mode 100644 index 00000000..0ae7031b --- /dev/null +++ b/contributors/devel/sig-release/getting-builds.md @@ -0,0 +1,48 @@ +# Getting Kubernetes Builds + +You can use [hack/get-build.sh](http://releases.k8s.io/HEAD/hack/get-build.sh) +to get a build or to use as a reference on how to get the most recent builds +with curl. With `get-build.sh` you can grab the most recent stable build, the +most recent release candidate, or the most recent build to pass our ci and gce +e2e tests (essentially a nightly build). + +Run `./hack/get-build.sh -h` for its usage. + +To get a build at a specific version (v1.1.1) use: + +```console +./hack/get-build.sh v1.1.1 +``` + +To get the latest stable release: + +```console +./hack/get-build.sh release/stable +``` + +Use the "-v" option to print the version number of a build without retrieving +it. For example, the following prints the version number for the latest ci +build: + +```console +./hack/get-build.sh -v ci/latest +``` + +You can also use the gsutil tool to explore the Google Cloud Storage release +buckets. 
Here are some examples: + +```sh +gsutil cat gs://kubernetes-release-dev/ci/latest.txt # output the latest ci version number +gsutil cat gs://kubernetes-release-dev/ci/latest-green.txt # output the latest ci version number that passed gce e2e +gsutil ls gs://kubernetes-release-dev/ci/v0.20.0-29-g29a55cc/ # list the contents of a ci release +gsutil ls gs://kubernetes-release/release # list all official releases and rcs +``` + +## Install `gsutil` + +Example installation: + +```console +$ curl -sSL https://storage.googleapis.com/pub/gsutil.tar.gz | sudo tar -xz -C /usr/local/src +$ sudo ln -s /usr/local/src/gsutil/gsutil /usr/bin/gsutil +``` diff --git a/contributors/devel/sig-release/release.md b/contributors/devel/sig-release/release.md new file mode 100644 index 00000000..b4e9224e --- /dev/null +++ b/contributors/devel/sig-release/release.md @@ -0,0 +1,307 @@ +# Targeting Features, Issues and PRs to Release Milestones + +This document is focused on Kubernetes developers and contributors +who need to create a feature, issue, or pull request which targets a specific +release milestone. + +- [TL;DR](#tldr) +- [Definitions](#definitions) +- [The Release Cycle](#the-release-cycle) +- [Removal Of Items From The Milestone](#removal-of-items-from-the-milestone) +- [Adding An Item To The Milestone](#adding-an-item-to-the-milestone) + - [Milestone Maintainers](#milestone-maintainers) + - [Feature additions](#feature-additions) + - [Issue additions](#issue-additions) + - [PR Additions](#pr-additions) +- [Other Required Labels](#other-required-labels) + - [SIG Owner Label](#sig-owner-label) + - [Priority Label](#priority-label) + - [Issue Kind Label](#issue-kind-label) + +The process for shepherding features, issues, and pull requests +into a Kubernetes release spans multiple stakeholders: +* the feature, issue, or pull request owner +* SIG leadership +* the release team + +Information on workflows and interactions are described below. + +As the owner of a feature, issue, or pull request (PR), it is your +responsibility to ensure release milestone requirements are met. +Automation and the release team will be in contact with you if +updates are required, but inaction can result in your work being +removed from the milestone. Additional requirements exist when the +target milestone is a prior release (see [cherry pick +process](cherry-picks.md) for more information). + +## TL;DR + +If you want your PR to get merged, it needs the following required labels and milestones, represented here by the Prow /commands it would take to add them: +<table> +<tr> +<td></td> +<td>Normal Dev</td> +<td>Code Freeze</td> +<td>Post-Release</td> +</tr> +<tr> +<td></td> +<td>Weeks 1-8</td> +<td>Weeks 9-11</td> +<td>Weeks 11+</td> +</tr> +<tr> +<td>Required Labels</td> +<td> +<ul> +<!--Weeks 1-8--> +<li>/sig {name}</li> +<li>/kind {type}</li> +<li>/lgtm</li> +<li>/approved</li> +</ul> +</td> +<td> +<ul> +<!--Weeks 9-11--> +<li>/milestone {v1.y}</li> +<li>/sig {name}</li> +<li>/kind {bug, failing-test}</li> +<li>/priority critical-urgent</li> +<li>/lgtm</li> +<li>/approved</li> +</ul> +</td> +<td> +<!--Weeks 11+--> +Return to 'Normal Dev' phase requirements: +<ul> +<li>/sig {name}</li> +<li>/kind {type}</li> +<li>/lgtm</li> +<li>/approved</li> +</ul> + +Merges into the 1.y branch are now [via cherrypicks](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md), approved by release branch manager. 
+</td>
+</tr>
+</table>
+
+In the past there was a requirement for a milestone-targeted pull request to have an associated GitHub issue opened, but this is no longer the case. Features are effectively GitHub issues or [KEPs](https://git.k8s.io/community/keps) which lead to subsequent PRs. The general labeling process should be consistent across artifact types.
+
+---
+
+## Definitions
+
+- *issue owners*: Creator, assignees, and user who moved the issue into a release milestone.
+- *release team*: Each Kubernetes release has a team doing project management tasks described [here](https://git.k8s.io/sig-release/release-team/README.md). The contact info for the team associated with any given release can be found [here](https://git.k8s.io/sig-release/releases/).
+- *Y days*: Refers to business days (M-F, using the location local to the release manager).
+- *feature*: see "[Is My Thing a Feature?](http://git.k8s.io/features/README.md#is-my-thing-a-feature)"
+- *release milestone*: semantic version string or [GitHub milestone](https://help.github.com/articles/associating-milestones-with-issues-and-pull-requests/) referring to a release MAJOR.MINOR vX.Y version. See also [release versioning](http://git.k8s.io/community/contributors/design-proposals/release/versioning.md)
+- *release branch*: Git branch "release-X.Y" created for the vX.Y milestone. Created at the time of the vX.Y-beta.0 release and maintained after the release for approximately 9 months with vX.Y.Z patch releases.
+
+## The Release Cycle
+
+
+
+Kubernetes releases currently happen four times per year. The release process can be thought of as having three main phases:
+* Feature Definition
+* Implementation
+* Stabilization
+
+But in reality this is an open source and agile project, with feature planning and implementation happening at all times. Given the project scale and globally distributed developer base, it is critical to project velocity to not rely on a trailing stabilization phase and rather have continuous integration testing which ensures the project is always stable, so that individual commits can be flagged as having broken something.
+
+With ongoing feature definition through the year, some set of items will bubble up as targeting a given release. The **enhancement freeze** starts ~4 weeks into the release cycle. By this point all intended feature work for the given release has been defined in suitable planning artifacts in conjunction with the Release Team's [enhancements lead](https://git.k8s.io/sig-release/release-team/role-handbooks/enhancements/README.md).
+
+Implementation and bugfixing are ongoing across the cycle, but culminate in a code freeze period:
+* The **code freeze** starts in week ~10 and continues for ~2 weeks. Only critical bug fixes are accepted into the release codebase.
+
+There are approximately two weeks following code freeze, and preceding release, during which all remaining critical issues must be resolved before release. This also gives time for documentation finalization.
+
+When the code base is sufficiently stable, the master branch re-opens for general development and work begins there for the next release milestone. Any remaining modifications for the current release are cherry picked from master back to the release branch. The release is built from the release branch.
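As a concrete sketch of that cherry-pick step (the PR number and branch are illustrative), the flow uses the script described in the [cherry pick process](cherry-picks.md):

```sh
# After the fix has merged to master as a hypothetical PR #12345,
# apply it to the release branch via the cherry-pick script:
hack/cherry_pick_pull.sh upstream/release-1.12 12345
```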
+ +Following release, the [Release Branch +Manager](https://git.k8s.io/sig-release/release-team/role-handbooks/branch-manager/README.md) +cherry picks additional critical fixes from the master branch for +a period of around 9 months, leaving an overlap of three release +versions forward support. Thus, each release is part of a broader +Kubernetes lifecycle: + + + +## Removal Of Items From The Milestone + +Before getting too far into the process for adding an item to the +milestone, please note: + +Members of the Release Team may remove Issues from the milestone +if they or the responsible SIG determine that the issue is not +actually blocking the release and is unlikely to be resolved in a +timely fashion. + +Members of the Release Team may remove PRs from the milestone for +any of the following, or similar, reasons: + +* PR is potentially de-stabilizing and is not needed to resolve a blocking issue; +* PR is a new, late feature PR and has not gone through the features process or the exception process; +* There is no responsible SIG willing to take ownership of the PR and resolve any follow-up issues with it; +* PR is not correctly labelled; +* Work has visibly halted on the PR and delivery dates are uncertain or late. + +While members of the Release Team will help with labelling and +contacting SIG(s), it is the responsibility of the submitter to +categorize PRs, and to secure support from the relevant SIG to +guarantee that any breakage caused by the PR will be rapidly resolved. + +Where additional action is required, an attempt at human to human +escalation will be made by the release team through the following +channels: + +- Comment in GitHub mentioning the SIG team and SIG members as appropriate for the issue type +- Emailing the SIG mailing list + - bootstrapped with group email addresses from the [community sig list](/sig-list.md) + - optionally also directly addressing SIG leadership or other SIG members +- Messaging the SIG's Slack channel + - bootstrapped with the slackchannel and SIG leadership from the [community sig list](/sig-list.md) + - optionally directly "@" mentioning SIG leadership or others by handle + +## Adding An Item To The Milestone + +### Milestone Maintainers + +The members of the GitHub [“kubernetes-milestone-maintainers” +team](https://github.com/orgs/kubernetes/teams/kubernetes-milestone-maintainers/members) +are entrusted with the responsibility of specifying the release milestone on +GitHub artifacts. This group is [maintained by +SIG-Release](https://git.k8s.io/sig-release/release-team/README.md#milestone-maintainers) +and has representation from the various SIGs' leadership. + +### Feature additions + +Feature planning and definition takes many forms today, but a typical +example might be a large piece of work described in a +[KEP](https://git.k8s.io/community/keps), with associated +task issues in GitHub. When the plan has reached an implementable state and +work is underway, the feature or parts thereof are targeted for an upcoming +milestone by creating GitHub issues and marking them with the Prow "/milestone" +command. + +For the first ~4 weeks into the release cycle, the release team's +Enhancements Lead will interact with SIGs and feature owners via GitHub, +Slack, and SIG meetings to capture all required planning artifacts. + +If you have a feature to target for an upcoming release milestone, begin a +conversation with your SIG leadership and with that release's Enhancements +Lead. 
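As a hypothetical example, the Prow commands commented on such a feature issue might look like the following (the SIG name and milestone value are placeholders):

```
/sig storage
/kind feature
/milestone v1.y
```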
+ +### Issue additions + +Issues are marked as targeting a milestone via the Prow +"/milestone" command. + +The release team's [Bug Triage +Lead](https://git.k8s.io/sig-release/release-team/role-handbooks/bug-triage/README.md) and overall community watch +incoming issues and triage them, as described in the contributor +guide section on [issue triage](/contributors/guide/issue-triage.md). + +Marking issues with the milestone provides the community better +visibility regarding when an issue was observed and by when the community +feels it must be resolved. During code freeze, to merge a PR it is required +that a release milestone is set. + +An open issue is no longer required for a PR, but open issues and +associated PRs should have synchronized labels. For example a high +priority bug issue might not have its associated PR merged if the PR is +only marked as lower priority. + +### PR Additions + +PRs are marked as targeting a milestone via the Prow +"/milestone" command. + +This is a blocking requirement during code freeze as described above. + +## Other Required Labels + +*Note* [Here is the list of labels and their use and purpose.](https://git.k8s.io/test-infra/label_sync/labels.md#labels-that-apply-to-all-repos-for-both-issues-and-prs) + +### SIG Owner Label + +The SIG owner label defines the SIG to which we escalate if a +milestone issue is languishing or needs additional attention. If +there are no updates after escalation, the issue may be automatically +removed from the milestone. + +These are added with the Prow "/sig" command. For example to add +the label indicating SIG Storage is responsible, comment with `/sig +storage`. + +### Priority Label + +Priority labels are used to determine an escalation path before +moving issues out of the release milestone. They are also used to +determine whether or not a release should be blocked on the resolution +of the issue. + +- `priority/critical-urgent`: Never automatically move out of a release milestone; continually escalate to contributor and SIG through all available channels. + - considered a release blocking issue + - code freeze: issue owner update frequency: daily + - would require a patch release if left undiscovered until after the minor release. +- `priority/important-soon`: Escalate to the issue owners and SIG owner; move out of milestone after several unsuccessful escalation attempts. + - not considered a release blocking issue + - would not require a patch release + - will automatically be moved out of the release milestone at code freeze after a 4 day grace period +- `priority/important-longterm`: Escalate to the issue owners; move out of the milestone after 1 attempt. + - even less urgent / critical than `priority/important-soon` + - moved out of milestone more aggressively than `priority/important-soon` + +### Issue/PR Kind Label + +The issue kind is used to help identify the types of changes going +into the release over time. This may allow the release team to +develop a better understanding of what sorts of issues we would +miss with a faster release cadence. + +For release targeted issues, including pull requests, one of the following +issue kind labels must be set: + +- `kind/api-change`: Adds, removes, or changes an API +- `kind/bug`: Fixes a newly discovered bug. +- `kind/cleanup`: Adding tests, refactoring, fixing old bugs. +- `kind/design`: Related to design +- `kind/documentation`: Adds documentation +- `kind/failing-test`: CI test case is failing consistently. +- `kind/feature`: New functionality. 
+- `kind/flake`: CI test case is showing intermittent failures.
diff --git a/contributors/devel/sig-scalability/kubemark-guide.md b/contributors/devel/sig-scalability/kubemark-guide.md
new file mode 100644
index 00000000..ce5727e8
--- /dev/null
+++ b/contributors/devel/sig-scalability/kubemark-guide.md
@@ -0,0 +1,256 @@
+# Kubemark User Guide
+
+## Introduction
+
+Kubemark is a performance testing tool which allows users to run experiments on simulated clusters. The primary use case is scalability testing, as simulated clusters can be much bigger than the real ones. The objective is to expose problems with the master components (API server, controller manager or scheduler) that appear only on bigger clusters (e.g. small memory leaks).
+
+This document serves as a primer to understand what Kubemark is, what it is not, and how to use it.
+
+## Architecture
+
+At a very high level, a Kubemark cluster consists of two parts: a real master and a set of “Hollow” Nodes. The prefix “Hollow” to any component means an implementation/instantiation of the actual component with all “moving” parts mocked out. The best example is HollowKubelet, which pretends to be an ordinary Kubelet, but does not start anything, nor mount any volumes - it just lies that it does. More detailed design and implementation details are at the end of this document.
+
+Currently, master components run on a dedicated machine as pods that are created/managed by kubelet, which itself runs as either a systemd or a supervisord service on the master VM depending on the VM distro (though currently it is only systemd as we use a GCI image). Having a dedicated machine for the master has a slight advantage over running the master components on an external cluster, namely being able to completely isolate master resources from everything else. The HollowNodes on the other hand are run on an ‘external’ Kubernetes cluster as pods in an isolated namespace (named kubemark). This idea of using pods on a real cluster to behave (or act) as nodes of the kubemark cluster lies at the heart of kubemark's design.
+
+## Requirements
+
+To run Kubemark you need a Kubernetes cluster (called the `external cluster`) for running all your HollowNodes and a dedicated machine for the master. The master machine has to be directly routable from the HollowNodes. You also need access to a Docker repository (which is gcr.io in the case of GCE) that has the container images for etcd, hollow-node and node-problem-detector.
+
+Currently, the scripts are written to be easily usable on GCE, but it should be relatively straightforward to port them to different providers or bare metal. There is an ongoing effort to refactor kubemark code into provider-specific (gce) and provider-independent code, which should make it relatively simple to run kubemark clusters on other cloud providers as well.
+
+## Common use cases and helper scripts
+
+The common workflow for Kubemark (sketched in the commands below) is:
+- starting a Kubemark cluster (on GCE)
+- running e2e tests on the Kubemark cluster
+- monitoring test execution and debugging problems
+- turning down the Kubemark cluster
+
+(For now) the descriptions include comments helpful for anyone who wants to port Kubemark to different providers. (Later) When the refactoring mentioned in the section above finishes, we will replace these comments with a clean API that allows kubemark to run on top of any provider.
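On GCE, that workflow might look like the following sketch (the ginkgo focus value is just an example; all scripts are described in the sections below):

```sh
make quick-release                                    # build a Kubernetes release
test/kubemark/start-kubemark.sh                       # start the master and HollowNodes
test/kubemark/run-e2e-tests.sh --ginkgo.focus="Load"  # run an e2e suite against Kubemark
test/kubemark/stop-kubemark.sh                        # tear the cluster down
```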
+
+### Starting a Kubemark cluster
+
+To start a Kubemark cluster on GCE you need to create an external kubernetes cluster (it can be GCE, GKE or anything else) by yourself, make sure that kubeconfig points to it by default, build a kubernetes release (e.g. by running `make quick-release`) and run the `test/kubemark/start-kubemark.sh` script. This script will create a VM for the master (along with a mounted PD and firewall rules set), then start kubelet and run the pods for the master components. Following this, it sets up the HollowNodes as Pods on the external cluster and does all the setup necessary to let them talk to the kubemark apiserver. It will use the configuration stored in `cluster/kubemark/config-default.sh` - you can tweak it however you want, but note that some features may not be implemented yet, as implementation of Hollow components/mocks will probably be lagging behind the ‘real’ ones. For performance tests, the interesting variables are `NUM_NODES` and `KUBEMARK_MASTER_SIZE`. After the start-kubemark script finishes, you’ll have a ready Kubemark cluster, and a kubeconfig file for talking to the Kubemark cluster is stored in `test/kubemark/resources/kubeconfig.kubemark`.
+
+Currently we're running HollowNode with a limit of 0.09 CPU core/pod and 220MB of memory. However, if we also take into account the resources absorbed by default cluster addons and fluentD running on the 'external' cluster, this limit becomes ~0.1 CPU core/pod, thus allowing ~10 HollowNodes to run per core (on an "n1-standard-8" VM node).
+
+#### Behind-the-scenes details:
+
+The start-kubemark.sh script does quite a lot of things:
+
+- Prepares a master machine named MASTER_NAME (this variable's value should be set by this point):
+  (*the steps below use gcloud, and should be easy to do outside of GCE*)
+  1. Creates a Persistent Disk for use by the master (one more for etcd-events, if flagged)
+  2. Creates a static IP address for the master in the cluster and assigns it to the variable MASTER_IP
+  3. Creates a VM instance for the master, configured with the PD and IP created above.
+  4. Sets a firewall rule on the master to open port 443\* for all TCP traffic by default.
+
+<sub>\* Port 443 is a secured port on the master machine which is used for all external communication with the API server. In the last sentence *external* means all traffic coming from other machines, including all the Nodes, not only from outside of the cluster. Currently, local components, i.e. the ControllerManager and Scheduler, talk to the API server using the insecure port 8080.</sub>
+
+- [Optional to read] Establishes the certs/keys required for setting up the PKI for the kubemark cluster:
+  (*the steps below are independent of GCE and work for all providers*)
+  1. Generate a randomly named temporary directory for storing PKI certs/keys which is delete-trapped on EXIT.
+  2. Create a bearer token for 'admin' in master.
+  3. Generate a certificate for the CA and (certificate + private-key) pairs for each of master, kubelet and kubecfg.
+  4. Generate kubelet and kubeproxy tokens for master.
+  5. Write a kubeconfig locally to `test/kubemark/resources/kubeconfig.kubemark` for enabling local kubectl use.
+
+- Sets up the environment and starts master components (through the `start-kubemark-master.sh` script):
+  (*the steps below use gcloud for SSH and SCP to master, and should be easy to do outside of GCE*)
+  1. SSH to the master machine, create a new directory (`/etc/srv/kubernetes`) and write all the certs/keys/tokens/passwords to it.
+  2.
SCP all the master pod manifests, shell scripts (`start-kubemark-master.sh`, `configure-kubectl.sh`, etc), and config files for passing env variables (`kubemark-master-env.sh`) from the local machine to the master.
+  3. SSH to the master machine and run the startup script `start-kubemark-master.sh` (and possibly others).
+
+  Note: The directory structure and the functions performed by the startup script(s) can vary based on the master distro. We currently support the GCI image `gci-dev-56-8977-0-0` in GCE.
+
+- Sets up and starts HollowNodes (as pods) on the external cluster:
+  (*the steps below (except 2nd and 3rd) are independent of GCE and work for all providers*)
+  1. Identify the right kubemark binary from the current kubernetes repo for the platform linux/amd64.
+  2. Create a Docker image for HollowNode using this binary and upload it to a remote Docker repository. (We use gcr.io/ as our remote Docker repository in GCE; it will be different for other providers.)
+  3. [One-off] Create and upload a Docker image for NodeProblemDetector (see the kubernetes/node-problem-detector repo), which is one of the containers in the HollowNode pod, besides HollowKubelet and HollowProxy. However, we use it with a hollow config that essentially has an empty set of rules and conditions to be detected. This step is required only for other cloud providers, as the docker image for GCE already exists on GCR.
+  4. Create a secret which stores the kubeconfig for use by HollowKubelet/HollowProxy, addons, and configMaps for the HollowNode and the HollowNodeProblemDetector.
+  5. Create a ReplicationController for HollowNodes that starts them up, after replacing all variables in the hollow-node_template.json resource.
+  6. Wait until all HollowNodes are in the Running phase.
+
+### Running e2e tests on Kubemark cluster
+
+To run a standard e2e test on the Kubemark cluster created in the previous step, execute the `test/kubemark/run-e2e-tests.sh` script. It will configure ginkgo to use the Kubemark cluster and start an e2e test. This script should not need any changes to work on other cloud providers.
+
+By default (if nothing is passed to it) the script will run a Density '30 test. If you want to run a different e2e test you just need to provide the flags you want to be passed to the `hack/ginkgo-e2e.sh` script, e.g. `--ginkgo.focus="Load"` to run the Load test.
+
+By default, at the end of each test, it will delete namespaces and everything under them (e.g. events, replication controllers) on the Kubemark master, which takes a lot of time. Such work isn't needed in most cases: if you delete your Kubemark cluster after running `run-e2e-tests.sh`; if you don't care about namespace deletion performance, specifically related to etcd; etc. There is a flag that enables you to avoid namespace deletion: `--delete-namespace=false`. Adding the flag should let you see in logs: `Found DeleteNamespace=false, skipping namespace deletion!`
+
+### Monitoring test execution and debugging problems
+
+Run-e2e-tests prints the same output on Kubemark as on an ordinary e2e cluster, but if you need to dig deeper you need to learn how to debug HollowNodes and how the master machine (currently) differs from an ordinary one.
+
+If you need to debug the master machine, you can do similar things as on an ordinary master. The difference between a Kubemark setup and an ordinary setup is that in Kubemark etcd is run as a plain docker container, and all master components are run as normal processes.
There's no Kubelet overseeing them. Logs are stored in exactly the same place, i.e. the `/var/logs/` directory. Because the binaries are not supervised by anything, they won't be restarted in the case of a crash.
+
+To help with debugging from inside the cluster, the startup script puts a `~/configure-kubectl.sh` script on the master. It downloads the `gcloud` and `kubectl` tools and configures kubectl to work on the unsecured master port (useful if there are problems with security). After the script is run you can use the kubectl command from the master machine to play with the cluster.
+
+Debugging HollowNodes is a bit trickier: if you experience a problem on one of them, you need to learn which hollow-node pod corresponds to a given HollowNode known by the Master. During self-registration HollowNodes provide their cluster IPs as Names, which means that if you need to find a HollowNode named `10.2.4.5` you just need to find a Pod in the external cluster with this cluster IP. There's a helper script `test/kubemark/get-real-pod-for-hollow-node.sh` that does this for you.
+
+When you have a Pod name you can use `kubectl logs` on the external cluster to get logs, or use a `kubectl describe pod` call to find the external Node on which this particular HollowNode is running so you can ssh to it.
+
+E.g. suppose you want to see the logs of the HollowKubelet on which pod `my-pod` is running. To do so, you can execute:
+
+```
+$ kubectl --kubeconfig=kubernetes/test/kubemark/resources/kubeconfig.kubemark describe pod my-pod
+```
+
+This outputs the pod description, which includes a line:
+
+```
+Node: 1.2.3.4/1.2.3.4
+```
+
+To learn the `hollow-node` pod corresponding to node `1.2.3.4` you use the aforementioned script:
+
+```
+$ kubernetes/test/kubemark/get-real-pod-for-hollow-node.sh 1.2.3.4
+```
+
+which will output the line:
+
+```
+hollow-node-1234
+```
+
+Now you just use an ordinary kubectl command to get the logs:
+
+```
+kubectl --namespace=kubemark logs hollow-node-1234
+```
+
+All those things should work exactly the same on all cloud providers.
+
+### Turning down Kubemark cluster
+
+On GCE you just need to execute the `test/kubemark/stop-kubemark.sh` script, which will delete the HollowNode ReplicationController and all the resources for you. On other providers you’ll need to delete all this stuff by yourself. As part of the effort mentioned above to refactor kubemark into provider-independent and provider-specific parts, the resource deletion logic specific to the provider would move out into a clean API.
+
+## Some current implementation details and future roadmap
+
+The Kubemark master uses exactly the same binaries as ordinary Kubernetes does. This means that it will never be out of date. On the other hand, HollowNodes use an existing fake for the Kubelet (called SimpleKubelet), which mocks its runtime manager with `pkg/kubelet/dockertools/fake_manager.go`, where most logic sits. Because there's no easy way of mocking other managers (e.g. VolumeManager), they are not supported in Kubemark (e.g. we can't schedule Pods with volumes in them yet).
+
+We currently plan to extend kubemark along the following directions:
+- As you may have noticed above, we aim to make kubemark more structured and easy to run across various providers without having to tweak the setup scripts, using a well-defined kubemark-provider API.
+- Allow kubemark to run on various distros (GCI, debian, redhat, etc) for any given provider.
+- Make Kubemark performance on ci-tests mimic real cluster ci-tests on metrics such as CPU, memory and network bandwidth usage, realizing this goal through measurable objectives (e.g. the kubemark metric should vary no more than X% from the real cluster metric). We could also use metrics reported by Prometheus.
+- Improve logging of CI-test metrics (such as aggregated API call latencies, scheduling call latencies, %ile for CPU/mem usage of different master components in density/load tests) by packing them into well-structured artifacts instead of the (current) dumping to logs.
+- Create a Dashboard that allows easy viewing and comparison of these metrics across tests.
+
diff --git a/contributors/devel/sig-scalability/profiling.md b/contributors/devel/sig-scalability/profiling.md
new file mode 100644
index 00000000..f7c8b2e5
--- /dev/null
+++ b/contributors/devel/sig-scalability/profiling.md
@@ -0,0 +1,76 @@
+# Profiling Kubernetes
+
+This document explains how to plug in the profiler and how to profile Kubernetes services. To get familiar with the tools mentioned below, it is strongly recommended to read [Profiling Go Programs](https://blog.golang.org/profiling-go-programs).
+
+## Profiling library
+
+Go comes with the built-in 'net/http/pprof' profiling library and profiling web service. The service works by binding the debug/pprof/ subtree on a running webserver to the profiler. Reading from subpages of debug/pprof returns pprof-formatted profiles of the running binary. The output can be processed offline by the tool of choice, or used as input to the handy 'go tool pprof', which can graphically represent the result.
+
+## Adding profiling to the APIserver
+
+TL;DR: Add lines:
+
+```go
+m.mux.HandleFunc("/debug/pprof/", pprof.Index)
+m.mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
+m.mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
+```
+
+to the `init(c *Config)` method in 'pkg/master/master.go' and import the 'net/http/pprof' package.
+
+In most use cases, to use the profiler service it's enough to do 'import _ net/http/pprof', which automatically registers a handler in the default http.Server. A slight inconvenience is that the APIserver uses the default server for intra-cluster communication, so plugging the profiler into it is not really useful. In 'pkg/kubelet/server/server.go' more servers are created and started as separate goroutines. The one that is usually serving external traffic is secureServer. The handler for this traffic is defined in 'pkg/master/master.go' and stored in the Handler variable. It is created from an HTTP multiplexer, so the only thing that needs to be done is adding the profiler handler functions to this multiplexer. This is exactly what the lines after the TL;DR do.
+
+## Connecting to the profiler
+
+Even with the profiler running, I found it not entirely straightforward to use 'go tool pprof' with it. The problem is that, at least for dev purposes, the certificates generated for the APIserver are not signed by anyone trusted, and because secureServer serves only secure traffic it isn't straightforward to connect to the service. The best workaround I found is creating an ssh tunnel from the open unsecured port on the kubernetes_master to some external server, and using this server as a proxy. To save everyone looking for the correct ssh flags, it is done by running:
+
+```sh
+ssh kubernetes_master -L<local_port>:localhost:8080
+```
+
+or an analogous one for your cloud provider. Afterwards you can e.g. run
+
+```sh
+go tool pprof http://localhost:<local_port>/debug/pprof/profile
+```
+
+to get a 30-second
CPU profile. + +## Contention profiling + +To enable contention profiling you need to add line `rt.SetBlockProfileRate(1)` in addition to `m.mux.HandleFunc(...)` added before (`rt` stands for `runtime` in `master.go`). This enables 'debug/pprof/block' subpage, which can be used as an input to `go tool pprof`. + +## Profiling in tests + +To gather a profile from a test, the HTTP interface is probably not suitable. Instead, you can add the `-cpuprofile` flag to your KUBE_TEST_ARGS, e.g. + +```sh +make test-integration WHAT="./test/integration/scheduler" KUBE_TEST_ARGS="-cpuprofile cpu.out" +go tool pprof cpu.out +``` + +See the ['go test' flags](https://golang.org/cmd/go/#hdr-Description_of_testing_flags) for how to capture other types of profiles. + +## Profiling in a benchmark test + +Gathering a profile from a benchmark test works in the same way as regular tests, but sometimes there may be expensive setup that you want excluded from the profile. (i.e. any time you would use `b.ResetTimer()`) + +To solve this problem, you can explicitly start the profile in your test code like so. + +```go +func BenchmarkMyFeature(b *testing.B) { + // Expensive test setup... + b.ResetTimer() + f, err := os.Create("bench_profile.out") + if err != nil { + log.Fatal("could not create profile file: ", err) + } + if err := pprof.StartCPUProfile(f); err != nil { + log.Fatal("could not start CPU profile: ", err) + } + defer pprof.StopCPUProfile() + // Rest of the test... +} +``` + +> Note: Code added to a test to gather CPU profiles should not be merged. It is meant to be temporary while you create an analyze profiles. diff --git a/contributors/devel/sig-scheduling/scheduler.md b/contributors/devel/sig-scheduling/scheduler.md new file mode 100644 index 00000000..486b04a9 --- /dev/null +++ b/contributors/devel/sig-scheduling/scheduler.md @@ -0,0 +1,90 @@ +# The Kubernetes Scheduler + +The Kubernetes scheduler runs as a process alongside the other master components such as the API server. +Its interface to the API server is to watch for Pods with an empty PodSpec.NodeName, +and for each Pod, it posts a binding indicating where the Pod should be scheduled. + +## Exploring the code + +We are dividing scheduler into three layers from high level: +- [cmd/kube-scheduler/scheduler.go](http://releases.k8s.io/HEAD/cmd/kube-scheduler/scheduler.go): + This is the main() entry that does initialization before calling the scheduler framework. +- [pkg/scheduler/scheduler.go](http://releases.k8s.io/HEAD/pkg/scheduler/scheduler.go): + This is the scheduler framework that handles stuff (e.g. binding) beyond the scheduling algorithm. +- [pkg/scheduler/core/generic_scheduler.go](http://releases.k8s.io/HEAD/pkg/scheduler/core/generic_scheduler.go): + The scheduling algorithm that assigns nodes for pods. + +## The scheduling algorithm + +``` +For given pod: + + +---------------------------------------------+ + | Schedulable nodes: | + | | + | +--------+ +--------+ +--------+ | + | | node 1 | | node 2 | | node 3 | | + | +--------+ +--------+ +--------+ | + | | + +-------------------+-------------------------+ + | + | + v + +-------------------+-------------------------+ + + Pred. 
filters: node 3 doesn't have enough resource + + +-------------------+-------------------------+ + | + | + v + +-------------------+-------------------------+ + | remaining nodes: | + | +--------+ +--------+ | + | | node 1 | | node 2 | | + | +--------+ +--------+ | + | | + +-------------------+-------------------------+ + | + | + v + +-------------------+-------------------------+ + + Priority function: node 1: p=2 + node 2: p=5 + + +-------------------+-------------------------+ + | + | + v + select max{node priority} = node 2 +``` + +The Scheduler tries to find a node for each Pod, one at a time. +- First it applies a set of "predicates" to filter out inappropriate nodes. For example, if the PodSpec specifies resource requests, then the scheduler will filter out nodes that don't have at least that much resources available (computed as the capacity of the node minus the sum of the resource requests of the containers that are already running on the node). +- Second, it applies a set of "priority functions" +that rank the nodes that weren't filtered out by the predicate check. For example, it tries to spread Pods across nodes and zones while at the same time favoring the least (theoretically) loaded nodes (where "load" - in theory - is measured as the sum of the resource requests of the containers running on the node, divided by the node's capacity). +- Finally, the node with the highest priority is chosen (or, if there are multiple such nodes, then one of them is chosen at random). The code for this main scheduling loop is in the function `Schedule()` in [pkg/scheduler/core/generic_scheduler.go](http://releases.k8s.io/HEAD/pkg/scheduler/core/generic_scheduler.go) + +### Predicates and priorities policies + +Predicates are a set of policies applied one by one to filter out inappropriate nodes. +Priorities are a set of policies applied one by one to rank nodes (that made it through the filter of the predicates). +By default, Kubernetes provides built-in predicates and priorities policies documented in [scheduler_algorithm.md](scheduler_algorithm.md). +The predicates and priorities code are defined in [pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/predicates/predicates.go) and [pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/priorities/) , respectively. + + +## Scheduler extensibility + +The scheduler is extensible: the cluster administrator can choose which of the pre-defined +scheduling policies to apply, and can add new ones. + +### Modifying policies + +The policies that are applied when scheduling can be chosen in one of two ways. +The default policies used are selected by the functions `defaultPredicates()` and `defaultPriorities()` in +[pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithmprovider/defaults/defaults.go). +However, the choice of policies can be overridden by passing the command-line flag `--policy-config-file` to the scheduler, pointing to a JSON file specifying which scheduling policies to use. See [examples/scheduler-policy-config.json](https://git.k8s.io/examples/staging/scheduler-policy-config.json) for an example +config file. (Note that the config file format is versioned; the API is defined in [pkg/scheduler/api](http://releases.k8s.io/HEAD/pkg/scheduler/api/)). 
+Thus to add a new scheduling policy, you should modify [pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/predicates/predicates.go) or add to the directory [pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/priorities/), and either register the policy in `defaultPredicates()` or `defaultPriorities()`, or use a policy config file. + diff --git a/contributors/devel/sig-scheduling/scheduler_algorithm.md b/contributors/devel/sig-scheduling/scheduler_algorithm.md new file mode 100644 index 00000000..e6596b47 --- /dev/null +++ b/contributors/devel/sig-scheduling/scheduler_algorithm.md @@ -0,0 +1,40 @@ +# Scheduler Algorithm in Kubernetes + +For each unscheduled Pod, the Kubernetes scheduler tries to find a node across the cluster according to a set of rules. A general introduction to the Kubernetes scheduler can be found at [scheduler.md](scheduler.md). In this document, the algorithm of how to select a node for the Pod is explained. There are two steps before a destination node of a Pod is chosen. The first step is filtering all the nodes and the second is ranking the remaining nodes to find a best fit for the Pod. + +## Filtering the nodes + +The purpose of filtering the nodes is to filter out the nodes that do not meet certain requirements of the Pod. For example, if the free resource on a node (measured by the capacity minus the sum of the resource requests of all the Pods that already run on the node) is less than the Pod's required resource, the node should not be considered in the ranking phase so it is filtered out. Currently, there are several "predicates" implementing different filtering policies, including: + +- `NoDiskConflict`: Evaluate if a pod can fit due to the volumes it requests, and those that are already mounted. Currently supported volumes are: AWS EBS, GCE PD, ISCSI and Ceph RBD. Only Persistent Volume Claims for those supported types are checked. Persistent Volumes added directly to pods are not evaluated and are not constrained by this policy. +- `NoVolumeZoneConflict`: Evaluate if the volumes a pod requests are available on the node, given the Zone restrictions. +- `PodFitsResources`: Check if the free resource (CPU and Memory) meets the requirement of the Pod. The free resource is measured by the capacity minus the sum of requests of all Pods on the node. To learn more about the resource QoS in Kubernetes, please check [QoS proposal](../design-proposals/node/resource-qos.md). +- `PodFitsHostPorts`: Check if any HostPort required by the Pod is already occupied on the node. +- `HostName`: Filter out all nodes except the one specified in the PodSpec's NodeName field. +- `MatchNodeSelector`: Check if the labels of the node match the labels specified in the Pod's `nodeSelector` field and, as of Kubernetes v1.2, also match the `nodeAffinity` if present. See [here](https://kubernetes.io/docs/user-guide/node-selection/) for more details on both. +- `MaxEBSVolumeCount`: Ensure that the number of attached ElasticBlockStore volumes does not exceed a maximum value (by default, 39, since Amazon recommends a maximum of 40 with one of those 40 reserved for the root volume -- see [Amazon's documentation](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html#linux-specific-volume-limits)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable. 
+- `MaxGCEPDVolumeCount`: Ensure that the number of attached GCE PersistentDisk volumes does not exceed a maximum value (by default, 16, which is the maximum GCE allows -- see [GCE's documentation](https://cloud.google.com/compute/docs/disks/persistent-disks#limits_for_predefined_machine_types)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable.
+- `CheckNodeMemoryPressure`: Check if a pod can be scheduled on a node reporting the memory pressure condition. Currently, no `BestEffort` pods should be placed on a node under memory pressure, as they would be automatically evicted by the kubelet.
+- `CheckNodeDiskPressure`: Check if a pod can be scheduled on a node reporting the disk pressure condition. Currently, no pods should be placed on a node under disk pressure, as they would be automatically evicted by the kubelet.
+
+The details of the above predicates can be found in [pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/predicates/predicates.go). All predicates mentioned above can be used in combination to perform a sophisticated filtering policy. Kubernetes uses some, but not all, of these predicates by default. You can see which ones are used by default in [pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithmprovider/defaults/defaults.go).
+
+## Ranking the nodes
+
+The filtered nodes are considered suitable to host the Pod, and often more than one node remains. Kubernetes prioritizes the remaining nodes to find the "best" one for the Pod. The prioritization is performed by a set of priority functions. For each remaining node, a priority function gives a score on a scale of 0-10, with 10 representing "most preferred" and 0 "least preferred". Each priority function is weighted by a positive number, and the final score of each node is calculated by adding up all the weighted scores. For example, suppose there are two priority functions, `priorityFunc1` and `priorityFunc2`, with weighting factors `weight1` and `weight2` respectively; then the final score of some NodeA is:
+
+    finalScoreNodeA = (weight1 * priorityFunc1) + (weight2 * priorityFunc2)
+
+For instance, with `weight1 = 1`, `weight2 = 2`, and NodeA scoring 6 on `priorityFunc1` and 8 on `priorityFunc2`, NodeA's final score is 1*6 + 2*8 = 22.
+
+After the scores of all nodes are calculated, the node with the highest score is chosen as the host of the Pod. If more than one node has the same highest score, a random one among them is chosen.
+
+Currently, the Kubernetes scheduler provides some practical priority functions, including:
+
+- `LeastRequestedPriority`: The node is prioritized based on the fraction of the node that would be free if the new Pod were scheduled onto the node. (In other words, (capacity - sum of requests of all Pods already on the node - request of Pod that is being scheduled) / capacity). CPU and memory are equally weighted. The node with the highest free fraction is the most preferred. Note that this priority function has the effect of spreading Pods across the nodes with respect to resource consumption.
+- `BalancedResourceAllocation`: This priority function tries to put the Pod on a node such that the CPU and Memory utilization rate is balanced after the Pod is deployed.
+- `SelectorSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service, replication controller, or replica set on the same node. If zone information is present on the nodes, the priority will be adjusted so that pods are spread across zones and nodes.
+- `CalculateAntiAffinityPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on nodes with the same value for a particular label.
+- `ImageLocalityPriority`: Nodes are prioritized based on locality of the images requested by a pod. Nodes that already hold a larger total size of the images required by the pod are preferred over nodes that hold none, or a smaller total size, of those images.
+- `NodeAffinityPriority`: (Kubernetes v1.2) Implements `preferredDuringSchedulingIgnoredDuringExecution` node affinity; see [here](https://kubernetes.io/docs/user-guide/node-selection/) for more details.
+
+The details of the above priority functions can be found in [pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/pkg/scheduler/algorithm/priorities/). Kubernetes uses some, but not all, of these priority functions by default. You can see which ones are used by default in [pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/pkg/scheduler/algorithmprovider/defaults/defaults.go). As with predicates, you can combine the above priority functions and assign weight factors (positive numbers) to them as you want (check [scheduler.md](scheduler.md) for how to customize them).
+
diff --git a/contributors/devel/sig-storage/flexvolume.md b/contributors/devel/sig-storage/flexvolume.md
new file mode 100644
index 00000000..12c46382
--- /dev/null
+++ b/contributors/devel/sig-storage/flexvolume.md
@@ -0,0 +1,155 @@
+# Flexvolume
+
+Flexvolume enables users to write their own drivers and add support for their volumes in Kubernetes. Vendor drivers should be installed in the volume plugin path on every node, and on master if the driver requires attach capability (unless the `--enable-controller-attach-detach` Kubelet option is set to false, but this is highly discouraged because it is a legacy mode of operation).
+
+Flexvolume has been a GA feature since the Kubernetes 1.8 release.
+
+## Prerequisites
+
+Install the vendor driver on all nodes (also on master nodes if the `--enable-controller-attach-detach` Kubelet option is enabled) in the plugin path. Path for installing the plugin: `<plugindir>/<vendor~driver>/<driver>`. The default plugin directory is `/usr/libexec/kubernetes/kubelet-plugins/volume/exec/`. It can be changed in kubelet via the `--volume-plugin-dir` flag, and in controller manager via the `--flex-volume-plugin-dir` flag.
+
+For example, to add a `cifs` driver by vendor `foo`, install the driver at: `/usr/libexec/kubernetes/kubelet-plugins/volume/exec/foo~cifs/cifs`
+
+The vendor and driver names must match `flexVolume.driver` in the volume spec, with '~' replaced with '/'. For example, if `flexVolume.driver` is set to `foo/cifs`, then the vendor is `foo`, and the driver is `cifs`.
+
+## Dynamic Plugin Discovery
+Beginning in v1.8, Flexvolume supports the ability to detect drivers on the fly. Instead of requiring drivers to exist at system initialization time, or having to restart kubelet or controller manager, drivers can be installed, upgraded/downgraded, and uninstalled while the system is running.
+For more information, please refer to the [design document](/contributors/design-proposals/storage/flexvolume-deployment.md).
+
+## Automated Plugin Installation/Upgrade
+One possible way to install and upgrade your Flexvolume drivers is by using a DaemonSet, as sketched below.
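+A minimal sketch of such a DaemonSet follows; the image name
+`example.com/cifs-flex-installer` and the in-container driver path are
+hypothetical, and a real deployment should follow the recommended method linked
+underneath:
+
+```sh
+# Hypothetical installer: copies the cifs driver from the container into the
+# host's Flexvolume plugin directory (<plugindir>/foo~cifs/cifs) on every node.
+kubectl apply -f - <<'EOF'
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: cifs-flex-installer
+spec:
+  selector:
+    matchLabels:
+      name: cifs-flex-installer
+  template:
+    metadata:
+      labels:
+        name: cifs-flex-installer
+    spec:
+      containers:
+      - name: installer
+        image: example.com/cifs-flex-installer  # hypothetical image bundling the driver
+        command:
+        - sh
+        - -c
+        # Install to <plugindir>/<vendor~driver>/<driver>, then stay alive.
+        - mkdir -p /flexmnt/foo~cifs &&
+          cp /opt/cifs /flexmnt/foo~cifs/cifs &&
+          while true; do sleep 3600; done
+        volumeMounts:
+        - name: flexvolume-dir
+          mountPath: /flexmnt
+      volumes:
+      - name: flexvolume-dir
+        hostPath:
+          path: /usr/libexec/kubernetes/kubelet-plugins/volume/exec/
+EOF
+```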
+See [Recommended Driver Deployment Method](/contributors/design-proposals/storage/flexvolume-deployment.md#recommended-driver-deployment-method) for details, and see [here](https://git.k8s.io/examples/staging/volumes/flexvolume/deploy/) for an example.
+
+## Plugin details
+The plugin mechanism expects the following call-outs to be implemented by the backend drivers. Some call-outs are optional. Call-outs are invoked from Kubelet and Controller Manager.
+
+### Driver invocation model:
+
+#### Init:
+Initializes the driver. Called during Kubelet & Controller manager initialization. On success, the function returns a capabilities map showing whether each Flexvolume capability is supported by the driver.
+Current capabilities:
+* `attach` - a boolean field indicating whether the driver requires attach and detach operations. This field is *required*, although for backward-compatibility the default value is set to `true`, i.e. requires attach and detach.
+See [Driver output](#driver-output) for the capabilities map format.
+```
+<driver executable> init
+```
+
+#### Attach:
+Attach the volume specified by the given spec on the given node. On success, returns the device path where the device is attached on the node. Called from Controller Manager.
+
+This call-out does not pass "secrets" specified in the Flexvolume spec. If your driver requires secrets, do not implement this call-out and instead use the "mount" call-out and implement attach and mount in that call-out.
+
+```
+<driver executable> attach <json options> <node name>
+```
+
+#### Detach:
+Detach the volume from the node. Called from Controller Manager.
+```
+<driver executable> detach <mount device> <node name>
+```
+
+#### Wait for attach:
+Wait for the volume to be attached on the remote node. On success, the path to the device is returned. Called from Controller Manager. The timeout should be 10m (based on https://git.k8s.io/kubernetes/pkg/kubelet/volumemanager/volume_manager.go#L88).
+
+```
+<driver executable> waitforattach <mount device> <json options>
+```
+
+#### Volume is Attached:
+Check whether the volume is attached on the node. Called from Controller Manager.
+
+```
+<driver executable> isattached <json options> <node name>
+```
+
+#### Mount device:
+Mounts the device to a global path which individual pods can then bind mount. Called only from Kubelet.
+
+This call-out does not pass "secrets" specified in the Flexvolume spec. If your driver requires secrets, do not implement this call-out and instead use the "mount" call-out and implement attach and mount in that call-out.
+
+```
+<driver executable> mountdevice <mount dir> <mount device> <json options>
+```
+
+#### Unmount device:
+Unmounts the global mount for the device. This is called once all bind mounts have been unmounted. Called only from Kubelet.
+
+```
+<driver executable> unmountdevice <mount device>
+```
+In addition to the user-specified options and the [default JSON options](#default-json-options), the following options capturing information about the pod are generated automatically and passed to the "mount" call-out.
+
+```
+kubernetes.io/pod.name
+kubernetes.io/pod.namespace
+kubernetes.io/pod.uid
+kubernetes.io/serviceAccount.name
+```
+
+#### Mount:
+Mount the volume at the mount dir. This call-out defaults to a bind mount for drivers which implement the attach & mount-device call-outs. Called only from Kubelet.
+
+```
+<driver executable> mount <mount dir> <json options>
+```
+
+#### Unmount:
+Unmount the volume.
+This call-out defaults to unmounting the bind mount for drivers which implement the attach & mount-device call-outs. Called only from Kubelet.
+
+```
+<driver executable> unmount <mount dir>
+```
+
+See [lvm] & [nfs] for a quick example of how to write a simple flexvolume driver.
+
+### Driver output:
+
+Flexvolume expects the driver to reply with the status of the operation in the
+following format.
+
+```
+{
+  "status": "<Success/Failure/Not supported>",
+  "message": "<Reason for success/failure>",
+  "device": "<Path to the device attached. This field is valid only for attach & waitforattach call-outs>",
+  "volumeName": "<Cluster wide unique name of the volume. Valid only for getvolumename call-out>",
+  "attached": <True/False (Return true if volume is attached on the node. Valid only for isattached call-out)>,
+  "capabilities": <Only included as part of the Init response>
+  {
+    "attach": <True/False (Return true if the driver implements attach and detach)>
+  }
+}
+```
+
+### Default JSON options
+
+In addition to the flags specified by the user in the Options field of the FlexVolumeSource, the following flags (set through their corresponding FlexVolumeSource fields) are also passed to the executable.
+Note: Secrets are passed only to "mount/unmount" call-outs.
+
+```
+"kubernetes.io/fsType":"<FS type>",
+"kubernetes.io/readwrite":"<rw>",
+"kubernetes.io/fsGroup":"<FS group>",
+"kubernetes.io/mountsDir":"<string>",
+"kubernetes.io/pvOrVolumeName":"<Volume name if the volume is in-line in the pod spec; PV name if the volume is a PV>"
+
+"kubernetes.io/pod.name":"<string>",
+"kubernetes.io/pod.namespace":"<string>",
+"kubernetes.io/pod.uid":"<string>",
+"kubernetes.io/serviceAccount.name":"<string>",
+
+"kubernetes.io/secret/key1":"<secret1>"
+...
+"kubernetes.io/secret/keyN":"<secretN>"
+```
+
+### Example of Flexvolume
+
+Please refer to the [Flexvolume example directory]. See [nginx-lvm.yaml] & [nginx-nfs.yaml] for a quick example of how to use Flexvolume in a pod.
+
+
+[lvm]: https://git.k8s.io/examples/staging/volumes/flexvolume/lvm
+[nfs]: https://git.k8s.io/examples/staging/volumes/flexvolume/nfs
+[nginx-lvm.yaml]: https://git.k8s.io/examples/staging/volumes/flexvolume/nginx-lvm.yaml
+[nginx-nfs.yaml]: https://git.k8s.io/examples/staging/volumes/flexvolume/nginx-nfs.yaml
+[Flexvolume example directory]: https://git.k8s.io/examples/staging/volumes/flexvolume/
diff --git a/contributors/devel/sig-testing/bazel.md b/contributors/devel/sig-testing/bazel.md
new file mode 100644
index 00000000..991a0ac2
--- /dev/null
+++ b/contributors/devel/sig-testing/bazel.md
@@ -0,0 +1,184 @@
+# Build and test with Bazel
+
+Building and testing Kubernetes with Bazel is supported but not yet default.
+
+Bazel is used to run all Kubernetes PRs on [Prow](https://prow.k8s.io),
+as remote caching enables significantly reduced build and test times.
+
+Some repositories (such as kubernetes/test-infra) have switched to using Bazel
+exclusively for all build, test, and release workflows.
+
+Go rules are managed by the [`gazelle`](https://github.com/bazelbuild/rules_go/tree/master/go/tools/gazelle)
+tool, with some additional rules managed by the [`kazel`](https://git.k8s.io/repo-infra/kazel) tool.
+These tools are called via the `hack/update-bazel.sh` script.
+
+Instructions for installing Bazel
+can be found [here](https://www.bazel.io/versions/master/docs/install.html).
+
+Several convenience `make` rules have been created for common operations:
+
+* `make bazel-build`: builds all binaries in tree (`bazel build -- //...
+  -//vendor/...`)
+* `make bazel-test`: runs all unit tests (`bazel test --config=unit -- //...
+  //hack:verify-all -//build/... -//vendor/...`)
+* `make bazel-test-integration`: runs all integration tests (`bazel test
+  --config integration //test/integration/...`)
+* `make bazel-release`: builds release tarballs, Docker images (for server
+  components), and Debian images (`bazel build //build/release-tars`)
+
+You can also interact with Bazel directly; for example, to run all `kubectl` unit
+tests, run
+
+```console
+$ bazel test //pkg/kubectl/...
+```
+
+## Planter
+If you don't want to install Bazel, you can instead try using the unofficial
+[Planter](https://git.k8s.io/test-infra/planter) tool,
+which runs Bazel inside a Docker container.
+
+For example, you can run
+```console
+$ ../test-infra/planter/planter.sh make bazel-test
+$ ../test-infra/planter/planter.sh bazel build //cmd/kubectl
+```
+
+## Continuous Integration
+
+There are several Bazel CI jobs:
+* [ci-kubernetes-bazel-build](http://k8s-testgrid.appspot.com/google-unit#bazel-build): builds everything
+  with Bazel
+* [ci-kubernetes-bazel-test](http://k8s-testgrid.appspot.com/google-unit#bazel-test): runs unit tests
+  with Bazel
+
+Similar jobs are run on all PRs; additionally, several of the e2e jobs use
+Bazel-built binaries when launching and testing Kubernetes clusters.
+
+## Updating `BUILD` files
+
+To update `BUILD` files, run:
+
+```console
+$ ./hack/update-bazel.sh
+```
+
+To prevent Go rules from being updated, consult the [gazelle
+documentation](https://github.com/bazelbuild/rules_go/tree/master/go/tools/gazelle).
+
+Note that much like Go files and `gofmt`, `BUILD` files have standardized,
+opinionated style rules, and running `hack/update-bazel.sh` will format them for you.
+
+If you want to auto-format `BUILD` files in your editor, use of
+[Buildifier](https://github.com/bazelbuild/buildtools/blob/master/buildifier/README.md)
+is recommended.
+
+Updating the `BUILD` file for a package will be required when:
+* Files are added to or removed from a package
+* Import dependencies change for a package
+* A `BUILD` file has been updated and needs to be reformatted
+* A new `BUILD` file has been added (parent `BUILD` files will be updated)
+
+## Known issues and limitations
+
+### [Cross-compilation of cgo is not currently natively supported](https://github.com/bazelbuild/rules_go/issues/1020)
+All binaries are currently built for the host OS and architecture running Bazel.
+(For example, you can't currently target linux/amd64 from macOS or linux/s390x
+from an amd64 machine.)
+
+The Go rules support cross-compilation of pure Go code using the `--platforms`
+flag, and this is being used successfully in the kubernetes/test-infra repo.
+
+It may already be possible to cross-compile cgo code if a custom CC toolchain is
+set up, possibly reusing the kube-cross Docker image, but this area needs
+further exploration.
+
+### The CC toolchain is not fully hermetic
+Bazel requires several tools and development packages to be installed on the system, including `gcc`, `g++`, glibc and libstdc++ development headers, and glibc static development libraries. Please check your distribution for the exact names of the packages.
+Examples for some commonly used distributions are below:
+
+| Dependency            | Debian/Ubuntu                 | CentOS                         | OpenSuSE                                 |
+|:---------------------:|-------------------------------|--------------------------------|------------------------------------------|
+| Build essentials      | `apt install build-essential` | `yum groupinstall development` | `zypper install -t pattern devel_C_C++`  |
+| GCC C++               | `apt install g++`             | `yum install gcc-c++`          | `zypper install gcc-c++`                 |
+| GNU Libc static files | `apt install libc6-dev`       | `yum install glibc-static`     | `zypper install glibc-devel-static`      |
+
+If any of these packages change, they may also cause spurious build failures
+as described in [this issue](https://github.com/bazelbuild/bazel/issues/4907).
+
+An example error might look something like
+```
+ERROR: undeclared inclusion(s) in rule '//vendor/golang.org/x/text/cases:go_default_library.cgo_c_lib':
+this rule is missing dependency declarations for the following files included by 'vendor/golang.org/x/text/cases/linux_amd64_stripped/go_default_library.cgo_codegen~/_cgo_export.c':
+  '/usr/lib/gcc/x86_64-linux-gnu/7/include/stddef.h'
+```
+
+The only way to recover from this error is to force Bazel to regenerate its
+automatically-generated CC toolchain configuration by running `bazel clean
+--expunge`.
+
+Improving cgo cross-compilation may help with all of this.
+
+### Changes to Go imports require updating BUILD files
+The Go rules in `BUILD` and `BUILD.bazel` files must be updated any time files
+are added or removed or Go imports are changed. These rules are automatically
+maintained by `gazelle`, which is run via `hack/update-bazel.sh`, but this is
+still a source of friction.
+
+[Autogazelle](https://github.com/bazelbuild/bazel-gazelle/tree/master/cmd/autogazelle)
+is a new experimental tool which may reduce or remove the need for developers
+to run `hack/update-bazel.sh`, but no work has yet been done to support it in
+kubernetes/kubernetes.
+
+### Code coverage support is incomplete for Go
+Bazel and the Go rules have limited support for code coverage. Running something
+like `bazel coverage -- //... -//vendor/...` will run tests in coverage mode,
+but no report summary is currently generated. It may be possible to combine
+`bazel coverage` with
+[Gopherage](https://github.com/kubernetes/test-infra/tree/master/gopherage),
+however.
+
+### Kubernetes code generators are not fully supported
+The make-based build system in kubernetes/kubernetes runs several code
+generators at build time:
+* [conversion-gen](https://github.com/kubernetes/code-generator/tree/master/cmd/conversion-gen)
+* [deepcopy-gen](https://github.com/kubernetes/code-generator/tree/master/cmd/deepcopy-gen)
+* [defaulter-gen](https://github.com/kubernetes/code-generator/tree/master/cmd/defaulter-gen)
+* [openapi-gen](https://github.com/kubernetes/kube-openapi/tree/master/cmd/openapi-gen)
+* [go-bindata](https://github.com/jteeuwen/go-bindata/tree/master/go-bindata)
+
+Of these, only `openapi-gen` and `go-bindata` are currently supported when
+building Kubernetes with Bazel.
+
+The `go-bindata` generated code is produced by hand-written genrules.
+
+The other code generators use special build tags of the form `// +k8s:generator-name=arg`; for example, input files to the openapi-gen tool are specified with `// +k8s:openapi-gen=true`.
+
+`kazel` is used to find all packages that require OpenAPI generation, and then a
+handwritten genrule consumes this list of packages to run `openapi-gen`.
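+For a rough sense of what that discovery step amounts to, the opted-in packages
+can be listed by searching for the tag directly (an illustrative command only;
+`kazel` performs the real discovery):
+
+```sh
+# Illustrative only: list Go files that opt into OpenAPI generation via the
+# +k8s:openapi-gen=true build tag.
+grep -rl -- '+k8s:openapi-gen=true' pkg/ staging/ | head
+```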
+
+For `openapi-gen`, a single output file is produced in a single Go package, which
+makes this fairly compatible with Bazel.
+All other Kubernetes code generators generally produce one output file per input
+package, which is less compatible with the Bazel workflow.
+
+The make-based build system batches up all input packages into one call to the
+code generator binary, but this is inefficient for Bazel's incrementality, as a
+change in one package may result in unnecessarily recompiling many other
+packages.
+On the other hand, calling the code generator binary multiple times is less
+efficient than calling it once, since many of the generators parse the tree for
+Go type information and other metadata.
+
+One additional challenge is that many of the code generators add additional
+Go imports which `gazelle` (and `autogazelle`) cannot infer, and so they must be
+explicitly added as dependencies in the `BUILD` files.
+
+Kubernetes has even more code generators than this limited list, but the rest
+are generally run as `hack/update-*.sh` scripts and checked into the repository,
+and so are not immediately needed for Bazel parity.
+
+## Contacts
+For help or discussion, join the [#bazel](https://kubernetes.slack.com/messages/bazel)
+channel on Kubernetes Slack.
diff --git a/contributors/devel/sig-testing/e2e-tests.md b/contributors/devel/sig-testing/e2e-tests.md
new file mode 100644
index 00000000..e01a896f
--- /dev/null
+++ b/contributors/devel/sig-testing/e2e-tests.md
@@ -0,0 +1,764 @@
+# End-to-End Testing in Kubernetes
+
+**Table of Contents**
+
+- [End-to-End Testing in Kubernetes](#end-to-end-testing-in-kubernetes)
+  - [Overview](#overview)
+  - [Building Kubernetes and Running the Tests](#building-kubernetes-and-running-the-tests)
+    - [Cleaning up](#cleaning-up)
+  - [Advanced testing](#advanced-testing)
+    - [Extracting a specific version of kubernetes](#extracting-a-specific-version-of-kubernetes)
+    - [Bringing up a cluster for testing](#bringing-up-a-cluster-for-testing)
+    - [Federation e2e tests](#federation-e2e-tests)
+      - [Configuring federation e2e tests](#configuring-federation-e2e-tests)
+      - [Image Push Repository](#image-push-repository)
+      - [Build](#build)
+      - [Deploy federation control plane](#deploy-federation-control-plane)
+      - [Run the Tests](#run-the-tests)
+      - [Teardown](#teardown)
+      - [Shortcuts for test developers](#shortcuts-for-test-developers)
+    - [Debugging clusters](#debugging-clusters)
+    - [Local clusters](#local-clusters)
+      - [Testing against local clusters](#testing-against-local-clusters)
+    - [Version-skewed and upgrade testing](#version-skewed-and-upgrade-testing)
+      - [Test jobs naming convention](#test-jobs-naming-convention)
+  - [Kinds of tests](#kinds-of-tests)
+    - [Viper configuration and hierarchical test parameters](#viper-configuration-and-hierarchical-test-parameters)
+    - [Conformance tests](#conformance-tests)
+  - [Continuous Integration](#continuous-integration)
+    - [What is CI?](#what-is-ci)
+    - [What runs in CI?](#what-runs-in-ci)
+      - [Non-default tests](#non-default-tests)
+    - [The PR-builder](#the-pr-builder)
+    - [Adding a test to CI](#adding-a-test-to-ci)
+    - [Moving a test out of CI](#moving-a-test-out-of-ci)
+  - [Performance Evaluation](#performance-evaluation)
+  - [One More Thing](#one-more-thing)
+
+
+## Overview
+
+End-to-end (e2e) tests for Kubernetes provide a mechanism to test end-to-end
+behavior of the system, and are the last signal to ensure end-user operations
+match developer specifications.
+Although unit and integration tests provide a
+good signal, in a distributed system like Kubernetes it is not uncommon that a
+minor change may pass all unit and integration tests, but cause unforeseen
+changes at the system level.
+
+The primary objectives of the e2e tests are to ensure a consistent and reliable
+behavior of the Kubernetes code base, and to catch hard-to-test bugs before
+users do, when unit and integration tests are insufficient.
+
+The e2e tests in Kubernetes are built atop
+[Ginkgo](http://onsi.github.io/ginkgo/) and
+[Gomega](http://onsi.github.io/gomega/). There are a host of features that this
+Behavior-Driven Development (BDD) testing framework provides, and it is
+recommended that the developer read the documentation prior to diving into the
+tests.
+
+The purpose of *this* document is to serve as a primer for developers who are
+looking to execute or add tests using a local development environment.
+
+Before writing new tests or making substantive changes to existing tests, you
+should also read [Writing Good e2e Tests](writing-good-e2e-tests.md).
+
+## Building Kubernetes and Running the Tests
+
+There are a variety of ways to run e2e tests, but we aim to converge on a
+single canonical way: `kubetest`.
+
+You can install `kubetest` as follows:
+```sh
+go get -u k8s.io/test-infra/kubetest
+```
+
+You can run an end-to-end test which will bring up a master and nodes, perform
+some tests, and then tear everything down. Make sure you have followed the
+getting started steps for your chosen cloud platform (which might involve
+changing the --provider flag value to something other than "gce").
+
+You can quickly recompile the e2e testing framework via `go install ./test/e2e`.
+This will not do anything besides allow you to verify that the Go code compiles.
+If you want to run your e2e testing framework without re-provisioning the e2e setup,
+you can do so via `make WHAT=test/e2e/e2e.test`, and then re-running the ginkgo tests.
+
+To build Kubernetes, bring up a cluster, run tests, and tear everything down, use:
+
+```sh
+kubetest --build --up --test --down
+```
+
+If you'd like to just perform one of these steps, here are some examples:
+
+```sh
+# Build binaries for testing
+kubetest --build
+
+# Create a fresh cluster. Deletes a cluster first, if it exists
+kubetest --up
+
+# Run all tests
+kubetest --test
+
+# Run tests matching the regex "\[Feature:Performance\]" against a local cluster
+# Specify "--provider=local" flag when running the tests locally
+kubetest --test --test_args="--ginkgo.focus=\[Feature:Performance\]" --provider=local
+
+# Conversely, exclude tests that match the regex "Pods.*env"
+kubetest --test --test_args="--ginkgo.skip=Pods.*env"
+
+# Run tests in parallel, skip any that must be run serially
+GINKGO_PARALLEL=y kubetest --test --test_args="--ginkgo.skip=\[Serial\]"
+
+# Run tests in parallel, skip any that must be run serially and keep the test namespace if test failed
+GINKGO_PARALLEL=y kubetest --test --test_args="--ginkgo.skip=\[Serial\] --delete-namespace-on-failure=false"
+
+# Flags can be combined, and their actions will take place in this order:
+# --build, --up, --test, --down
+#
+# You can also specify an alternative provider, such as 'aws'
+#
+# e.g.:
+kubetest --provider=aws --build --up --test --down
+
+# -ctl can be used to quickly call kubectl against your e2e cluster. Useful for
+# cleaning up after a failed test or viewing logs.
+# kubectl output is on by default; use --verbose-commands=false to suppress it.
+kubetest -ctl='get events'
+kubetest -ctl='delete pod foobar'
+```
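+One more invocation worth knowing (see [Adding a test to CI](#adding-a-test-to-ci)
+below) is a dry run, which walks the test hierarchy without running anything:
+
+```sh
+# List the specs that would run, without actually running them.
+kubetest --test --test_args="--ginkgo.dryRun=true"
+```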
+
+The tests are built into a single binary which can be used to deploy a
+Kubernetes system or run tests against an already-deployed Kubernetes system.
+See `kubetest --help` (or the flag definitions in `hack/e2e.go`) for
+more options, such as reusing an existing cluster.
+
+### Cleaning up
+
+During a run, pressing `control-C` should result in an orderly shutdown, but if
+something goes wrong and you still have some VMs running you can force a cleanup
+with this command:
+
+```sh
+kubetest --down
+```
+
+## Advanced testing
+
+### Extracting a specific version of kubernetes
+
+The `kubetest` binary can download and extract a specific version of
+Kubernetes: the server, client, and test binaries. The `--extract=E` flag
+enables this functionality.
+
+There are a variety of values to pass this flag:
+
+```sh
+# Official builds: <ci|release>/<latest|stable>[-N.N]
+kubetest --extract=ci/latest --up # Deploy the latest ci build.
+kubetest --extract=ci/latest-1.5 --up # Deploy the latest 1.5 CI build.
+kubetest --extract=release/latest --up # Deploy the latest RC.
+kubetest --extract=release/stable-1.5 --up # Deploy the 1.5 release.
+
+# A specific version:
+kubetest --extract=v1.5.1 --up # Deploy 1.5.1
+kubetest --extract=v1.5.2-beta.0 --up # Deploy 1.5.2-beta.0
+kubetest --extract=gs://foo/bar --up # --stage=gs://foo/bar
+
+# Whatever GKE is using (gke, gke-staging, gke-test):
+kubetest --extract=gke --up # Deploy whatever GKE prod uses
+
+# Using a GCI version:
+kubetest --extract=gci/gci-canary --up # Deploy the version for next gci release
+kubetest --extract=gci/gci-57 # Deploy the version bound to gci m57
+kubetest --extract=gci/gci-57/ci/latest # Deploy the latest CI build using gci m57 for the VM image
+
+# Reuse whatever is already built
+kubetest --up # Most common. Note, no extract flag
+kubetest --build --up # Most common. Note, no extract flag
+kubetest --build --stage=gs://foo/bar --extract=local --up # Extract the staged version
+```
+
+### Bringing up a cluster for testing
+
+If you want, you may bring up a cluster in some other manner and run tests
+against it. To do so, or to do other non-standard test things, you can pass
+arguments into Ginkgo using `--test_args` (e.g. see above). For brevity, we
+will look at a subset of the options, which are listed below:
+
+```
+--ginkgo.dryRun=false: If set, ginkgo will walk the test hierarchy without
+actually running anything.
+
+--ginkgo.failFast=false: If set, ginkgo will stop running a test suite after a
+failure occurs.
+
+--ginkgo.failOnPending=false: If set, ginkgo will mark the test suite as failed
+if any specs are pending.
+
+--ginkgo.focus="": If set, ginkgo will only run specs that match this regular
+expression.
+
+--ginkgo.noColor="n": If set to "y", ginkgo will not use color in the output.
+
+--ginkgo.skip="": If set, ginkgo will only run specs that do not match this
+regular expression.
+
+--ginkgo.trace=false: If set, the default reporter prints out the full stack trace
+when a failure occurs.
+
+--ginkgo.v=false: If set, the default reporter prints out all specs as they begin.
+
+--host="": The host, or api-server, to connect to.
+
+--kubeconfig="": Path to kubeconfig containing embedded authinfo.
+
+--provider="": The name of the Kubernetes provider (gce, gke, local, vagrant,
+etc.)
+
+--repo-root="../../": Root directory of the kubernetes repository, for finding test
+files.
+```
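+For example, to stop on the first failure and print full stack traces,
+combining two of the flags above:
+
+```sh
+kubetest --test --test_args="--ginkgo.failFast=true --ginkgo.trace=true"
+```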
+
+Prior to running the tests, you may want to create a simple auth file in
+your home directory, e.g. `$HOME/.kube/config`, with the following:
+
+```
+{
+  "User": "root",
+  "Password": ""
+}
+```
+
+As mentioned earlier there are a host of other options that are available, but
+they are left to the developer.
+
+**NOTE:** If you are running tests on a local cluster repeatedly, you may need
+to periodically perform some manual cleanup:
+
+  - `rm -rf /var/run/kubernetes`, clears kube-generated credentials; sometimes
+stale permissions can cause problems.
+
+  - `sudo iptables -F`, clears iptables rules left by the kube-proxy.
+
+### Reproducing failures in flaky tests
+You can run a test repeatedly until it fails. This is useful when debugging
+flaky tests. In order to do so, you need to set the following environment
+variable:
+```sh
+$ export GINKGO_UNTIL_IT_FAILS=true
+```
+
+After setting the environment variable, you can run the tests as before. The e2e
+script adds `--untilItFails=true` to the ginkgo args if the environment variable is
+set. The flag asks ginkgo to run the test repeatedly until it fails.
+
+### Federation e2e tests
+
+By default, `e2e.go` provisions a single Kubernetes cluster, and any `Feature:Federation` ginkgo tests will be skipped.
+
+Federation e2e testing involves bringing up multiple "underlying" Kubernetes clusters,
+and deploying the federation control plane as a Kubernetes application on the underlying clusters.
+
+The federation e2e tests are still managed via `e2e.go`, but require some extra configuration items.
+
+#### Configuring federation e2e tests
+
+The following environment variables will enable federation e2e building, provisioning and testing.
+
+```sh
+$ export FEDERATION=true
+$ export E2E_ZONES="us-central1-a us-central1-b us-central1-f"
+```
+
+A Kubernetes cluster will be provisioned in each zone listed in `E2E_ZONES`. A zone can only appear once in the `E2E_ZONES` list.
+
+#### Image Push Repository
+
+Next, specify the docker repository where your CI images will be pushed.
+
+* **If `--provider=gce` or `--provider=gke`**:
+
+  If you use the same GCP project where you run the e2e tests as the container image repository,
+  the `FEDERATION_PUSH_REPO_BASE` environment variable will default to "gcr.io/${DEFAULT_GCP_PROJECT_NAME}",
+  and you can skip ahead to the **Build** section.
+
+  You can simply set your push repo base based on your project name, and the necessary repositories will be
+  auto-created when you first push your container images.
+
+  ```sh
+  $ export FEDERATION_PUSH_REPO_BASE="gcr.io/${GCE_PROJECT_NAME}"
+  ```
+
+  Skip ahead to the **Build** section.
+
+* **For all other providers**:
+
+  You'll be responsible for creating and managing access to the repositories manually.
+
+  ```sh
+  $ export FEDERATION_PUSH_REPO_BASE="quay.io/colin_hom"
+  ```
+
+  Given this example, the `federation-apiserver` container image will be pushed to the repository
+  `quay.io/colin_hom/federation-apiserver`.
+
+  The docker client on the machine running `e2e.go` must have push access for the following pre-existing repositories:
+
+  * `${FEDERATION_PUSH_REPO_BASE}/federation-apiserver`
+  * `${FEDERATION_PUSH_REPO_BASE}/federation-controller-manager`
+
+  These repositories must allow public read access, as the e2e node docker daemons will not have any credentials. If you're using
+  GCE/GKE as your provider, the repositories will have read-access by default.
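+Before building, a quick sanity check that the variable is set can save a
+failed push later (a trivial illustrative guard, not required by the tooling):
+
+```sh
+# Fails fast with a message if FEDERATION_PUSH_REPO_BASE is unset or empty.
+echo "Pushing federation images to: ${FEDERATION_PUSH_REPO_BASE:?must be set}"
+```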
+
+#### Build
+
+* Compile the binaries and build container images:
+
+  ```sh
+  $ KUBE_RELEASE_RUN_TESTS=n KUBE_FASTBUILD=true kubetest -build
+  ```
+
+* Push the federation container images
+
+  ```sh
+  $ federation/develop/push-federation-images.sh
+  ```
+
+#### Deploy federation control plane
+
+The following command will create the underlying Kubernetes clusters in each of `E2E_ZONES`, and then provision the
+federation control plane in the cluster occupying the last zone in the `E2E_ZONES` list.
+
+```sh
+$ kubetest --up
+```
+
+#### Run the Tests
+
+This will run only the `Feature:Federation` e2e tests. You can omit the `ginkgo.focus` argument to run the entire e2e suite.
+
+```sh
+$ kubetest --test --test_args="--ginkgo.focus=\[Feature:Federation\]"
+```
+
+#### Teardown
+
+```sh
+$ kubetest --down
+```
+
+#### Shortcuts for test developers
+
+* To speed up `--up`, provision a single-node Kubernetes cluster in a single e2e zone:
+
+  `NUM_NODES=1 E2E_ZONES="us-central1-f"`
+
+  Keep in mind that some tests may require multiple underlying clusters and/or minimum compute resource availability.
+
+* If you're hacking around with the federation control plane deployment itself,
+  you can quickly re-deploy the federation control plane Kubernetes manifests without tearing any resources down.
+  To re-deploy the federation control plane after running `--up` for the first time:
+
+  ```sh
+  $ federation/cluster/federation-up.sh
+  ```
+
+### Debugging clusters
+
+If a cluster fails to initialize, or you'd like to better understand cluster
+state to debug a failed e2e test, you can use the `cluster/log-dump.sh` script
+to gather logs.
+
+This script requires that the cluster provider supports ssh. Assuming it does,
+running:
+
+```sh
+$ federation/cluster/log-dump.sh <directory>
+```
+
+will ssh to the master and all nodes and download a variety of useful logs to
+the provided directory (which should already exist).
+
+The Google-run Jenkins builds automatically collect these logs for every
+build, saving them in the `artifacts` directory uploaded to GCS.
+
+### Local clusters
+
+It can be much faster to iterate on a local cluster instead of a cloud-based
+one. To start a local cluster, you can run:
+
+```sh
+# The PATH construction is needed because PATH is one of the special-cased
+# environment variables not passed by sudo -E
+sudo PATH=$PATH hack/local-up-cluster.sh
+```
+
+This will start a single-node Kubernetes cluster that runs pods using the local
+docker daemon. Press Control-C to stop the cluster.
+
+You can generate a valid kubeconfig file by following the instructions printed at the
+end of the aforementioned script.
+
+#### Testing against local clusters
+
+In order to run an e2e test against a locally running cluster, first make sure
+to have a local build of the tests:
+
+```sh
+kubetest --build
+```
+
+Then point the tests at a custom host directly:
+
+```sh
+export KUBECONFIG=/path/to/kubeconfig
+kubetest --provider=local --test
+```
+
+To control the tests that are run:
+
+```sh
+kubetest --provider=local --test --test_args="--ginkgo.focus=Secrets"
+```
+
+You will also likely need to specify `minStartupPods` to match the number of
+nodes in your cluster.
+If you're testing against a cluster set up by
+`local-up-cluster.sh`, you will need to do the following:
+
+```sh
+kubetest --provider=local --test --test_args="--minStartupPods=1 --ginkgo.focus=Secrets"
+```
+
+### Version-skewed and upgrade testing
+
+We run version-skewed tests to check that newer versions of Kubernetes work
+similarly enough to older versions. The general strategy is to cover the following cases:
+
+1. One version of `kubectl` with another version of the cluster and tests (e.g.
+   that v1.2 and v1.4 `kubectl` don't break v1.3 tests running against a v1.3
+   cluster).
+1. A newer version of the Kubernetes master with older nodes and tests (e.g.
+   that upgrading a master to v1.3 with nodes at v1.2 still passes v1.2 tests).
+1. A newer version of the whole cluster with older tests (e.g. that a cluster
+   upgraded---master and nodes---to v1.3 still passes v1.2 tests).
+1. That an upgraded cluster functions the same as a brand-new cluster of the
+   same version (e.g. a cluster upgraded to v1.3 passes the same v1.3 tests as
+   a newly-created v1.3 cluster).
+
+[kubetest](https://git.k8s.io/test-infra/kubetest) is
+the authoritative source on how to run version-skewed tests, but below is a
+quick-and-dirty tutorial.
+
+```sh
+# Assume you have two copies of the Kubernetes repository checked out, at
+# ./kubernetes and ./kubernetes_old

+# If using GKE:
+export CLUSTER_API_VERSION=${OLD_VERSION}
+
+# Deploy a cluster at the old version; see above for more details
+cd ./kubernetes_old
+kubetest --up
+
+# Upgrade the cluster to the new version
+#
+# If using GKE, add --upgrade-target=${NEW_VERSION}
+#
+# You can target Feature:MasterUpgrade or Feature:ClusterUpgrade
+cd ../kubernetes
+kubetest --provider=gke --test --check-version-skew=false --test_args="--ginkgo.focus=\[Feature:MasterUpgrade\]"
+
+# Run old tests with new kubectl
+cd ../kubernetes_old
+kubetest --provider=gke --test --test_args="--kubectl-path=$(pwd)/../kubernetes/cluster/kubectl.sh"
+```
+
+If you are just testing version-skew, you may want to just deploy at one
+version and then test at another version, instead of going through the whole
+upgrade process:
+
+```sh
+# With the same setup as above
+
+# Deploy a cluster at the new version
+cd ./kubernetes
+kubetest --up
+
+# Run new tests with old kubectl
+kubetest --test --test_args="--kubectl-path=$(pwd)/../kubernetes_old/cluster/kubectl.sh"
+
+# Run old tests with new kubectl
+cd ../kubernetes_old
+kubetest --test --test_args="--kubectl-path=$(pwd)/../kubernetes/cluster/kubectl.sh"
+```
+
+#### Test jobs naming convention
+
+**Version skew tests** are named as
+`<cloud-provider>-<master&node-version>-<kubectl-version>-<image-name>-kubectl-skew`,
+e.g.: `gke-1.5-1.6-cvm-kubectl-skew` means the cloud provider is GKE;
+master and nodes are built from the `release-1.5` branch;
+`kubectl` is built from the `release-1.6` branch;
+the image name is cvm (container_vm).
+The test suite is always the older one in version skew tests, e.g. from release-1.5 in this case.
+
+**Upgrade tests**:
+
+If a test job name ends with `upgrade-cluster`, it means we first upgrade
+the cluster (i.e. master and nodes) and then run the old test suite with new kubectl.
+
+If a test job name ends with `upgrade-cluster-new`, it means we first upgrade
+the cluster (i.e. master and nodes) and then run the new test suite with new kubectl.
+
+If a test job name ends with `upgrade-master`, it means we first upgrade
+the master, keep the nodes at the old version, and then run the old test suite with new kubectl.
+
+There are some examples in the table below,
+where `->` means upgrading; container_vm (cvm) and gci are image names.
+
+| test name | test suite | master version (image) | node version (image) | kubectl |
+| --------- | :--------: | :----: | :---: | :---: |
+| gce-1.5-1.6-upgrade-cluster | 1.5 | 1.5->1.6 | 1.5->1.6 | 1.6 |
+| gce-1.5-1.6-upgrade-cluster-new | 1.6 | 1.5->1.6 | 1.5->1.6 | 1.6 |
+| gce-1.5-1.6-upgrade-master | 1.5 | 1.5->1.6 | 1.5 | 1.6 |
+| gke-container_vm-1.5-container_vm-1.6-upgrade-cluster | 1.5 | 1.5->1.6 (cvm) | 1.5->1.6 (cvm) | 1.6 |
+| gke-gci-1.5-container_vm-1.6-upgrade-cluster-new | 1.6 | 1.5->1.6 (gci) | 1.5->1.6 (cvm) | 1.6 |
+| gke-gci-1.5-container_vm-1.6-upgrade-master | 1.5 | 1.5->1.6 (gci) | 1.5 (cvm) | 1.6 |
+
+## Kinds of tests
+
+We are working on implementing clearer partitioning of our e2e tests to make
+running a known set of tests easier (#10548). Tests can be labeled with any of
+the following labels, in order of increasing precedence (that is, each label
+listed below supersedes the previous ones):
+
+ - If a test has no labels, it is expected to run fast (under five minutes), be
+able to be run in parallel, and be consistent.
+
+ - `[Slow]`: If a test takes more than five minutes to run (by itself or in
+parallel with many other tests), it is labeled `[Slow]`. This partition allows
+us to run almost all of our tests quickly in parallel, without waiting for the
+stragglers to finish.
+
+ - `[Serial]`: If a test cannot be run in parallel with other tests (e.g. it
+takes too many resources or restarts nodes), it is labeled `[Serial]`, and
+should be run in serial as part of a separate suite.
+
+ - `[Disruptive]`: If a test restarts components that might cause other tests
+to fail or break the cluster completely, it is labeled `[Disruptive]`. Any
+`[Disruptive]` test is also assumed to qualify for the `[Serial]` label, but
+need not be labeled as both. These tests are not run against soak clusters to
+avoid restarting components.
+
+ - `[Flaky]`: If a test is found to be flaky and we have decided that it's too
+hard to fix in the short term (e.g. it's going to take a full engineer-week), it
+receives the `[Flaky]` label until it is fixed. The `[Flaky]` label should be
+used very sparingly, and should be accompanied with a reference to the issue for
+de-flaking the test, because while a test remains labeled `[Flaky]`, it is not
+monitored closely in CI. `[Flaky]` tests are by default not run, unless a
+`focus` or `skip` argument is explicitly given.
+
+ - `[Feature:.+]`: If a test has non-default requirements to run or targets
+some non-core functionality, and thus should not be run as part of the standard
+suite, it receives a `[Feature:.+]` label, e.g. `[Feature:Performance]` or
+`[Feature:Ingress]`. `[Feature:.+]` tests are not run in our core suites,
+instead running in custom suites. If a feature is experimental or alpha and is
+not enabled by default due to being incomplete or potentially subject to
+breaking changes, it does *not* block PR merges, and thus should run in
+some separate test suites owned by the feature owner(s)
+(see [Continuous Integration](#continuous-integration) below).
+
+ - `[Conformance]`: Designates that this test is included in the Conformance
+test suite for [Conformance Testing](conformance-tests.md). This test must
+meet a number of [requirements](conformance-tests.md#conformance-test-requirements)
+to be eligible for this tag. This tag does not supersede any other labels.
+
+ - `[LinuxOnly]`: If a test is known to be using Linux-specific features
+(e.g.: seLinuxOptions) or is unable to run on Windows nodes, it is labeled
+`[LinuxOnly]`. When using Windows nodes, this tag should be added to the
+`skip` argument.
+
+ - The following tags are not considered to be exhaustively applied, but are
+intended to further categorize existing `[Conformance]` tests, or tests that are
+being considered as candidates for promotion to `[Conformance]` as we work to
+refine requirements:
+   - `[Privileged]`: This is a test that requires privileged access
+   - `[Internet]`: This is a test that assumes access to the public internet
+   - `[Deprecated]`: This is a test that exercises a deprecated feature
+   - `[Alpha]`: This is a test that exercises an alpha feature
+   - `[Beta]`: This is a test that exercises a beta feature
+
+Every test should be owned by a [SIG](/sig-list.md),
+and have a corresponding `[sig-<name>]` label.
+
+### Viper configuration and hierarchical test parameters
+
+E2e test configuration idioms will increasingly be defined using viper, and
+decreasingly via flags.
+
+Flags in general fall apart once tests become sufficiently complicated. So, even
+if we could use another flag library, it wouldn't be ideal.
+
+To use viper, rather than flags, to configure your tests:
+
+- Just add "e2e.json" to the current directory you are in, and define parameters
+in it, e.g. `"kubeconfig":"/tmp/x"`.
+
+Note that advanced testing parameters and hierarchically defined parameters are
+only defined in viper; to see what they are, you can dive into
+[TestContextType](https://git.k8s.io/kubernetes/test/e2e/framework/test_context.go).
+
+In time, it is our intent to add or autogenerate a sample viper configuration
+that includes all e2e parameters, to ship with Kubernetes.
+
+### Conformance tests
+
+For more information on Conformance tests please see the
+[Conformance Testing](conformance-tests.md) documentation.
+
+## Continuous Integration
+
+A quick overview of how we run e2e CI on Kubernetes.
+
+### What is CI?
+
+We run a battery of [release-blocking jobs](https://k8s-testgrid.appspot.com/sig-release-master-blocking)
+against `HEAD` of the master branch on a continuous basis, and block merges
+via [Tide](https://git.k8s.io/test-infra/prow/cmd/tide) on a subset of those
+tests if they fail.
+
+CI results can be found at [ci-test.k8s.io](http://ci-test.k8s.io), e.g.
+[ci-test.k8s.io/kubernetes-e2e-gce/10594](http://ci-test.k8s.io/kubernetes-e2e-gce/10594).
+
+### What runs in CI?
+
+We run all default tests (those that aren't marked `[Flaky]` or `[Feature:.+]`)
+against GCE and GKE. To minimize the time from regression-to-green-run, we
+partition tests across different jobs:
+
+ - `kubernetes-e2e-<provider>` runs all non-`[Slow]`, non-`[Serial]`,
+non-`[Disruptive]`, non-`[Flaky]`, non-`[Feature:.+]` tests in parallel.
+
+ - `kubernetes-e2e-<provider>-slow` runs all `[Slow]`, non-`[Serial]`,
+non-`[Disruptive]`, non-`[Flaky]`, non-`[Feature:.+]` tests in parallel.
+
+ - `kubernetes-e2e-<provider>-serial` runs all `[Serial]` and `[Disruptive]`,
+non-`[Flaky]`, non-`[Feature:.+]` tests in serial.
+
+We also run non-default tests if the tests exercise general-availability ("GA")
+features that require a special environment to run in, e.g.
+`kubernetes-e2e-gce-scalability` and `kubernetes-kubemark-gce`, which test for
+Kubernetes performance.
+
+#### Non-default tests
+
+There are many `[Feature:.+]` tests that we don't run in CI.
+These tests are for features that
+are experimental (often in the `experimental` API), and aren't enabled by
+default.
+
+### The PR-builder
+
+We also run a battery of tests against every PR before we merge it. These tests
+are equivalent to `kubernetes-gce`: they run all non-`[Slow]`, non-`[Serial]`,
+non-`[Disruptive]`, non-`[Flaky]`, non-`[Feature:.+]` tests in parallel. These
+tests are considered "smoke tests" to give a decent signal that the PR doesn't
+break most functionality. Results for your PR can be found at
+[pr-test.k8s.io](http://pr-test.k8s.io), e.g.
+[pr-test.k8s.io/20354](http://pr-test.k8s.io/20354) for #20354.
+
+### Adding a test to CI
+
+As mentioned above, prior to adding a new test, it is a good idea to perform a
+`-ginkgo.dryRun=true` on the system, in order to see if a behavior is already
+being tested, or to determine if it may be possible to augment an existing set
+of tests for a specific use case.
+
+If a behavior does not currently have coverage and a developer wishes to add a
+new e2e test, navigate to the ./test/e2e directory and create a new test using
+the existing suite as a guide.
+
+**NOTE:** To build/run with tests in a new directory within ./test/e2e, add the
+directory to the import list in `./test/e2e/e2e_test.go`.
+
+TODO(#20357): Create a self-documented example which has been disabled, but can
+be copied to create new tests and outlines the capabilities and libraries used.
+
+When writing a test, consult [Kinds of tests](#kinds-of-tests) above to determine
+how your test should be marked (e.g. `[Slow]`, `[Serial]`; remember, by default we
+assume a test can run in parallel with other tests!).
+
+When first adding a test it should *not* go straight into CI, because failures
+block ordinary development. A test should only be added to CI after it has been
+running in some non-CI suite long enough to establish a track record showing
+that the test does not fail when run against *working* software. Note also that
+tests running in CI are generally running on a well-loaded cluster, so must
+contend for resources; see above about [kinds of tests](#kinds-of-tests).
+
+Generally, a feature starts as `experimental`, and will be run in some suite
+owned by the team developing the feature. If a feature is in beta or GA, it
+*should* block PR merges and releases. In moving from experimental to beta or GA, tests
+that are expected to pass by default should simply remove the `[Feature:.+]`
+label, and will be incorporated into our core suites. If tests are not expected
+to pass by default (e.g. they require a special environment such as added
+quota), they should remain with the `[Feature:.+]` label.
+
+Occasionally, we'll want to add tests to better exercise features that are
+already GA. These tests also shouldn't go straight to CI. They should begin by
+being marked as `[Flaky]` to be run outside of CI, and once a track-record for
+them is established, they may be promoted out of `[Flaky]`.
+
+### Moving a test out of CI
+
+If we have determined that a test is known-flaky and cannot be fixed in the
+short-term, we may move it out of CI indefinitely. This move should be used
+sparingly, as it effectively means that we have no coverage of that test. When a
+test is demoted, it should be marked `[Flaky]` with a comment accompanying the
+label with a reference to an issue opened to fix the test.
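+Demoted tests can still be exercised on demand by explicitly focusing on the
+label (an illustrative invocation following the patterns above):
+
+```sh
+# Run only tests labeled [Flaky]; they are skipped unless explicitly focused.
+kubetest --test --test_args="--ginkgo.focus=\[Flaky\]"
+```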
+
+## Performance Evaluation
+
+Another benefit of the e2e tests is the ability to create reproducible loads on
+the system, which can then be used to determine the responsiveness, or analyze
+other characteristics of the system. For example, the density tests load the
+system to 30, 50, or 100 pods per node and measure characteristics of the
+system, such as throughput and API latency.
+
+For a good overview of how we analyze performance data, please read the
+following [post](https://kubernetes.io/blog/2015/09/kubernetes-performance-measurements-and/).
+
+For developers who are interested in doing their own performance analysis, we
+recommend setting up [prometheus](http://prometheus.io/) for data collection,
+and using [grafana](https://prometheus.io/docs/visualization/grafana/) to
+visualize the data. There also exists the option of pushing your own metrics in
+from the tests using a
+[prom-push-gateway](http://prometheus.io/docs/instrumenting/pushing/).
+Containers for all of these components can be found
+[here](https://hub.docker.com/u/prom/).
+
+For more accurate measurements, you may wish to set up prometheus external to
+Kubernetes in an environment where it can access the major system components
+(api-server, controller-manager, scheduler). This is especially useful when
+attempting to gather metrics in a load-balanced api-server environment, because
+all api-servers can be analyzed independently as well as collectively. On
+startup, a configuration file is passed to prometheus that specifies the endpoints
+that prometheus will scrape, as well as the sampling interval.
+
+```
+#prometheus.conf
+job: {
+  name: "kubernetes"
+  scrape_interval: "1s"
+  target_group: {
+    # apiserver(s)
+    target: "http://localhost:8080/metrics"
+    # scheduler
+    target: "http://localhost:10251/metrics"
+    # controller-manager
+    target: "http://localhost:10252/metrics"
+  }
+}
+```
+
+Once prometheus is scraping the kubernetes endpoints, that data can then be
+plotted using promdash, and alerts can be created against the assortment of
+metrics that kubernetes provides.
+
+## One More Thing
+
+You should also know the [testing conventions](../guide/coding-conventions.md#testing-conventions).
+
+**HAPPY TESTING!**
diff --git a/contributors/devel/sig-testing/flaky-tests.md b/contributors/devel/sig-testing/flaky-tests.md
new file mode 100644
index 00000000..14302592
--- /dev/null
+++ b/contributors/devel/sig-testing/flaky-tests.md
@@ -0,0 +1,201 @@
+# Flaky tests
+
+Any test that fails occasionally is "flaky". Since our merges only proceed when
+all tests are green, and we have a number of different CI systems running the
+tests in various combinations, even a small percentage of flakes results in a
+lot of pain for people waiting for their PRs to merge.
+
+Therefore, it's very important that we write tests defensively. Situations that
+"almost never happen" happen with some regularity when run thousands of times in
+resource-constrained environments. Since flakes can often be quite hard to
+reproduce while still being common enough to block merges occasionally, it's
+additionally important that the test logs be useful for narrowing down exactly
+what caused the failure.
+
+Note that flakes can occur in unit tests, integration tests, or end-to-end
+tests, but probably occur most commonly in end-to-end tests.
+
+## Hunting Flakes
+
+You may notice that many of your PRs, or PRs you watch, share a common
+pre-submit failure; less frequent issues that are still of concern take
+more analysis over time.
+There are metrics recorded and viewable in:
+- [TestGrid](https://k8s-testgrid.appspot.com/presubmits-kubernetes-blocking#Summary)
+- [Velodrome](http://velodrome.k8s.io/dashboard/db/bigquery-metrics?orgId=1)
+
+It is worth noting that tests fail in presubmit a lot due to unbuildable code,
+but that won't happen repeatedly on the same commit unless there's a true issue
+in the code or a broader problem, like a dependency failing to pull in.
+
+## Filing issues for flaky tests
+
+Because flakes may be rare, it's very important that all relevant logs be
+discoverable from the issue.
+
+1. Search for the test name. If you find an open issue and you're 90% sure the
+   flake is exactly the same, add a comment instead of making a new issue.
+2. If you make a new issue, you should title it with the test name, prefixed by
+   "e2e/unit/integration flake:" (whichever is appropriate).
+3. Reference any old issues you found in step one. Also, make a comment in the
+   old issue referencing your new issue, because people monitoring only their
+   email do not see the backlinks GitHub adds. Alternatively, tag the person or
+   people who most recently worked on it.
+4. Paste, in block quotes, the entire log of the individual failing test, not
+   just the failure line.
+5. Link to durable storage with the rest of the logs. This means (for all the
+   tests that Google runs) the GCS link is mandatory! The Jenkins test result
+   link is nice but strictly optional: not only does it expire more quickly,
+   it's not accessible to non-Googlers.
+
+## Finding failed flaky test cases
+
+Find flaky test issues on GitHub under the [kind/flake issue label][flake].
+There are significant numbers of flaky tests reported on a regular basis, and P2
+flakes are under-investigated. Fixing flakes is a quick way to gain expertise
+and community goodwill.
+
+[flake]: https://github.com/kubernetes/kubernetes/issues?q=is%3Aopen+is%3Aissue+label%3Akind%2Fflake
+
+## Expectations when a flaky test is assigned to you
+
+Note that we won't randomly assign these issues to you unless you've opted in or
+you're part of a group that has opted in. We are more than happy to accept help
+from anyone in fixing these, but due to the severity of the problem when merges
+are blocked, we need reasonably quick turn-around time on test flakes. Therefore
+we have the following guidelines:
+
+1. If a flaky test is assigned to you, it's more important than anything else
+   you're doing unless you can get a special dispensation (in which case it will
+   be reassigned). If you have too many flaky tests assigned to you, or you
+   have such a dispensation, then it's *still* your responsibility to find new
+   owners (this may just mean giving stuff back to the relevant Team or SIG Lead).
+2. You should make a reasonable effort to reproduce it. Somewhere between an
+   hour and half a day of concentrated effort is "reasonable". It is perfectly
+   reasonable to ask for help!
+3. If you can reproduce it (or it's obvious from the logs what happened), you
+   should then be able to fix it, or in the case where someone is clearly more
+   qualified to fix it, reassign it with very clear instructions.
+4. Once you have made a change that you believe fixes a flake, it is conservative
+   to keep the issue for the flake open and see if it manifests again after the
+   change is merged.
+5. If you can't reproduce a flake: __don't just close it!__ Every time a flake comes
+   back, at least 2 hours of merge time is wasted.
So we need to make monotonic
+   progress towards narrowing it down every time a flake occurs. If you can't
+   figure it out from the logs, add log messages that would have helped you
+   figure it out. If you make changes to make a flake more reproducible, please
+   link your pull request to the flake you're working on.
+6. If a flake has been open, could not be reproduced, and has not manifested in
+   3 months, it is reasonable to close the flake issue with a note saying
+   why.
+
+## Reproducing unit test flakes
+
+Try the [stress command](https://godoc.org/golang.org/x/tools/cmd/stress).
+
+Install it:
+
+```
+$ go install golang.org/x/tools/cmd/stress
+```
+
+Then build your test binary:
+
+```
+$ go test -c -race
+```
+
+Then run it under stress:
+
+```
+$ stress ./package.test -test.run=FlakyTest
+```
+
+It runs the command and writes output to `/tmp/gostress-*` files when it fails.
+It periodically reports with run counts. Be careful with tests that use the
+`net/http/httptest` package; they could exhaust the available ports on your
+system!
+
+## Hunting flaky unit tests in Kubernetes
+
+Sometimes unit tests are flaky. This means that due to (usually) race
+conditions, they will occasionally fail, even though most of the time they pass.
+
+We have a goal of 99.9% flake-free tests. This means that a test may fail at
+most once in one thousand runs.
+
+Running a test 1000 times on your own machine can be tedious and time-consuming.
+Fortunately, there is a better way to achieve this using Kubernetes.
+
+_Note: these instructions are mildly hacky for now; as we get run-once semantics
+and logging, they will get better._
+
+There is a testing image `brendanburns/flake` on Docker Hub. We will use
+this image to test our fix.
+
+Create a replication controller with the following config:
+
+```yaml
+apiVersion: v1
+kind: ReplicationController
+metadata:
+  name: flakecontroller
+spec:
+  replicas: 24
+  template:
+    metadata:
+      labels:
+        name: flake
+    spec:
+      containers:
+      - name: flake
+        image: brendanburns/flake
+        env:
+        - name: TEST_PACKAGE
+          value: pkg/tools
+        - name: REPO_SPEC
+          value: https://github.com/kubernetes/kubernetes
+```
+
+Note that we omit the labels and the selector fields of the replication
+controller, because they will be populated from the labels field of the pod
+template by default.
+
+```sh
+kubectl create -f ./controller.yaml
+```
+
+This will spin up 24 instances of the test. They will run to completion, then
+exit, and the kubelet will restart them, accumulating more and more runs of the
+test.
+
+You can examine the recent runs of the test by calling `docker ps -a` and
+looking for tasks that exited with non-zero exit codes. Unfortunately, `docker
+ps -a` only keeps around the exit status of the last 15-20 containers with the
+same image, so you have to check them frequently.
+
+You can use this script to automate checking for failures, assuming your cluster
+is running on GCE and has four nodes:
+
+```sh
+echo "" > output.txt
+for i in {1..4}; do
+  echo "Checking kubernetes-node-${i}"
+  echo "kubernetes-node-${i}:" >> output.txt
+  gcloud compute ssh "kubernetes-node-${i}" --command="sudo docker ps -a" >> output.txt
+done
+grep "Exited ([^0])" output.txt
+```
+
+Eventually you will have sufficient runs for your purposes.
At that point you
+can delete the replication controller by running:
+
+```sh
+kubectl delete replicationcontroller flakecontroller
+```
+
+If you do a final check for flakes with `docker ps -a`, ignore tasks that
+exited -1, since that's what happens when you stop the replication controller.
+
+Happy flake hunting!
+
diff --git a/contributors/devel/sig-testing/gubernator.md b/contributors/devel/sig-testing/gubernator.md
new file mode 100644
index 00000000..b03d11a1
--- /dev/null
+++ b/contributors/devel/sig-testing/gubernator.md
@@ -0,0 +1,136 @@
+# Gubernator
+
+*This document is oriented toward developers who want to use Gubernator to debug while developing for Kubernetes.*
+
+
+- [Gubernator](#gubernator)
+  - [What is Gubernator?](#what-is-gubernator)
+  - [Gubernator Features](#gubernator-features)
+    - [Test Failures list](#test-failures-list)
+    - [Log Filtering](#log-filtering)
+  - [Gubernator for Local Tests](#gubernator-for-local-tests)
+  - [Future Work](#future-work)
+
+
+## What is Gubernator?
+
+[Gubernator](https://k8s-gubernator.appspot.com/) is a webpage for viewing and filtering Kubernetes
+test results.
+
+Gubernator simplifies the debugging process and makes it easier to track down failures by automating many
+steps commonly taken in searching through logs, and by offering tools to filter through logs to find relevant lines.
+Gubernator automates the steps of finding the failed tests, displaying relevant logs, and determining the
+failed pods and the corresponding pod UID, namespace, and container ID.
+It also allows for filtering of the log files to display relevant lines based on selected keywords, and
+allows for multiple logs to be woven together by timestamp.
+
+Gubernator runs on Google App Engine and fetches logs stored on Google Cloud Storage.
+
+## Gubernator Features
+
+### Test Failures list
+
+Comments made by k8s-ci-robot include a link to a page listing the failed tests.
+Each failed test comes with the corresponding error log from a junit file and a link
+to filter logs for that test.
+
+Based on the message logged in the junit file, the pod name may be displayed.
+
+
+
+[Test Failures List Example](https://k8s-gubernator.appspot.com/build/kubernetes-jenkins/logs/kubernetes-e2e-gke/11721)
+
+### Log Filtering
+
+The log filtering page comes with checkboxes and textboxes to aid in filtering. Filtered keywords will be bolded
+and lines including keywords will be highlighted. Up to four lines around the line of interest will also be displayed.
+
+
+
+If fewer than 100 lines are skipped, the "... skipping xx lines ..." message can be clicked to expand and show
+the hidden lines.
+
+Before expansion:
+After expansion:
+
+If the pod name was displayed in the Test Failures list, it will automatically be included in the filters.
+If it is not found in the error message, it can be manually entered into the textbox. Once a pod name
+is entered, the Pod UID, Namespace, and ContainerID may be automatically filled in as well. These can be
+altered if needed. To apply the filter, check off the options corresponding to the filter.
+
+
+
+To add a filter, type the term to be filtered into the textbox labeled "Add filter:" and press enter.
+Additional filters will be displayed as checkboxes under the textbox.
+
+
+
+To choose which logs to view, check off the checkboxes corresponding to the logs of interest. If multiple logs are
+included, the "Weave by timestamp" option can weave the selected logs together based on the timestamp in each line.
+
+
+[Log Filtering Example 1](https://k8s-gubernator.appspot.com/build/kubernetes-jenkins/logs/kubelet-gce-e2e-ci/5535/nodelog?pod=pod-configmaps-b5b876cb-3e1e-11e6-8956-42010af0001d&junit=junit_03.xml&wrap=on&logfiles=%2Fkubernetes-jenkins%2Flogs%2Fkubelet-gce-e2e-ci%2F5535%2Fartifacts%2Ftmp-node-e2e-7a5a3b40-e2e-node-coreos-stable20160622-image%2Fkube-apiserver.log&logfiles=%2Fkubernetes-jenkins%2Flogs%2Fkubelet-gce-e2e-ci%2F5535%2Fartifacts%2Ftmp-node-e2e-7a5a3b40-e2e-node-coreos-stable20160622-image%2Fkubelet.log&UID=on&poduid=b5b8a59e-3e1e-11e6-b358-42010af0001d&ns=e2e-tests-configmap-oi12h&cID=tmp-node-e2e-7a5a3b40-e2e-node-coreos-stable20160622-image)
+
+[Log Filtering Example 2](https://k8s-gubernator.appspot.com/build/kubernetes-jenkins/logs/kubernetes-e2e-gke/11721/nodelog?pod=client-containers-a53f813c-503e-11e6-88dd-0242ac110003&junit=junit_19.xml&wrap=on)
+
+
+### Gubernator for Local Tests
+
+*Currently Gubernator can only be used with remote node e2e tests.*
+
+**NOTE: Using Gubernator with local tests will publicly upload your test logs to Google Cloud Storage.**
+
+To use Gubernator to view logs from local test runs, set the GUBERNATOR flag to true.
+A link to view the test results will be printed to the console.
+Please note that running with the GUBERNATOR flag will bypass the user confirmation for uploading to GCS.
+
+```console
+
+$ make test-e2e-node REMOTE=true GUBERNATOR=true
+...
+================================================================
+Running gubernator.sh
+
+Gubernator linked below:
+k8s-gubernator.appspot.com/build/yourusername-g8r-logs/logs/e2e-node/timestamp
+```
+
+The gubernator.sh script can also be run after a remote node e2e test for the same effect.
+
+```console
+$ ./test/e2e_node/gubernator.sh
+Do you want to run gubernator.sh and upload logs publicly to GCS? [y/n]y
+...
+Gubernator linked below:
+k8s-gubernator.appspot.com/build/yourusername-g8r-logs/logs/e2e-node/timestamp
+```
+
+## Future Work
+
+Gubernator provides a framework for debugging failures and introduces useful features.
+There is still a lot of room for more features and growth to make the debugging process more efficient.
+
+To contribute, see https://git.k8s.io/test-infra/gubernator/README.md.
+
+* Extend GUBERNATOR flag to all local tests
+
+* More accurate identification of pod name, container ID, etc.
+  * Change content of logged strings for failures to include more information
+  * Better regex in Gubernator
+
+* Automate discovery of more keywords
+  * Volume Name
+  * Disk Name
+  * Pod IP
+
+* Clickable API objects in the displayed lines in order to add them as filters
+
+* Construct story of pod's lifetime
+  * Have a concise view of what a pod went through from when it started until it failed
+
+* Improve UI
+  * Have separate folders of logs in rows instead of in one long column
+  * Improve interface for adding additional features (maybe instead of textbox and checkbox, have chips)
diff --git a/contributors/devel/sig-testing/testing.md b/contributors/devel/sig-testing/testing.md
new file mode 100644
index 00000000..60f83b53
--- /dev/null
+++ b/contributors/devel/sig-testing/testing.md
@@ -0,0 +1,227 @@
+# Testing guide
+
+**Table of Contents**
+
+- [Testing guide](#testing-guide)
+  - [Unit tests](#unit-tests)
+    - [Run all unit tests](#run-all-unit-tests)
+    - [Set go flags during unit tests](#set-go-flags-during-unit-tests)
+    - [Run unit tests from certain packages](#run-unit-tests-from-certain-packages)
+    - [Run specific unit test cases in a package](#run-specific-unit-test-cases-in-a-package)
+    - [Stress running unit tests](#stress-running-unit-tests)
+    - [Unit test coverage](#unit-test-coverage)
+    - [Benchmark unit tests](#benchmark-unit-tests)
+  - [Integration tests](#integration-tests)
+    - [Install etcd dependency](#install-etcd-dependency)
+    - [Etcd test data](#etcd-test-data)
+    - [Run integration tests](#run-integration-tests)
+    - [Run a specific integration test](#run-a-specific-integration-test)
+  - [End-to-End tests](#end-to-end-tests)
+
+
+This assumes you already read the [development guide](development.md) to
+install go and godeps, and to configure your git client. All command examples
+are relative to the `kubernetes` root directory.
+
+Before sending pull requests you should at least make sure your changes have
+passed both unit and integration tests.
+
+Kubernetes only merges pull requests when unit, integration, and e2e tests are
+passing, so it is often a good idea to make sure the e2e tests work as well.
+
+## Unit tests
+
+* Unit tests should be fully hermetic
+  - Only access resources in the test binary.
+* All packages and any significant files require unit tests.
+* The preferred method of testing multiple scenarios or inputs is
+  [table driven testing](https://github.com/golang/go/wiki/TableDrivenTests)
+  - Example: [TestNamespaceAuthorization](https://git.k8s.io/kubernetes/test/integration/auth/auth_test.go)
+* Unit tests must pass on macOS and Windows platforms.
+  - Tests using linux-specific features must be skipped or compiled out.
+  - Skipped is better; compiled out is required when it won't compile.
+* Concurrent unit test runs must pass.
+* See [coding conventions](../../guide/coding-conventions.md).
+
+### Run all unit tests
+
+`make test` is the entrypoint for running the unit tests, and it ensures that
+`GOPATH` is set up correctly. If you have `GOPATH` set up correctly, you can
+also just use `go test` directly.
+
+```sh
+cd kubernetes
+make test # Run all unit tests.
+```
+
+If any unit test fails with a timeout panic (see [#1594](https://github.com/kubernetes/community/issues/1594)) on the testing package, you can increase the `KUBE_TIMEOUT` value as shown below.
+
+```sh
+make test KUBE_TIMEOUT="-timeout 300s"
+```
+
+### Set go flags during unit tests
+
+You can set [go flags](https://golang.org/cmd/go/) by setting the
+`GOFLAGS` environment variable. For example, setting `GOFLAGS="-v"` turns on
+verbose test output.
+
+### Run unit tests from certain packages
+
+`make test` accepts packages as arguments; the `k8s.io/kubernetes` prefix is
+added automatically to these:
+
+```sh
+make test WHAT=./pkg/api # run tests for pkg/api
+```
+
+To run multiple targets you need quotes:
+
+```sh
+make test WHAT="./pkg/api ./pkg/kubelet" # run tests for pkg/api and pkg/kubelet
+```
+
+In a shell, it's often handy to use brace expansion:
+
+```sh
+make test WHAT=./pkg/{api,kubelet} # run tests for pkg/api and pkg/kubelet
+```
+
+### Run specific unit test cases in a package
+
+You can set the test args using the `KUBE_TEST_ARGS` environment variable.
+You can use this to pass the `-run` argument to `go test`, which accepts a
+regular expression for the name of the test that should be run.
+
+```sh
+# Runs TestValidatePod in pkg/api/validation with the verbose flag set
+make test WHAT=./pkg/api/validation GOFLAGS="-v" KUBE_TEST_ARGS='-run ^TestValidatePod$'
+
+# Runs tests that match the regex ValidatePod|ValidateConfigMap in pkg/api/validation
+make test WHAT=./pkg/api/validation GOFLAGS="-v" KUBE_TEST_ARGS="-run ValidatePod\|ValidateConfigMap$"
+```
+
+For other supported test flags, see the [golang
+documentation](https://golang.org/cmd/go/#hdr-Testing_flags).
+
+### Stress running unit tests
+
+Running the same tests repeatedly is one way to root out flakes.
+You can do this efficiently:
+
+```sh
+# Have 2 workers run all tests 5 times each (10 total iterations).
+make test PARALLEL=2 ITERATION=5
+```
+
+For more advanced ideas please see [flaky-tests.md](flaky-tests.md).
+
+### Unit test coverage
+
+Currently, collecting coverage is only supported for the Go unit tests.
+
+To run all unit tests and generate an HTML coverage report, run the following:
+
+```sh
+make test KUBE_COVER=y
+```
+
+At the end of the run, an HTML report will be generated with the path
+printed to stdout.
+
+To run tests and collect coverage in only one package, pass its relative path
+under the `kubernetes` directory as an argument, for example:
+
+```sh
+make test WHAT=./pkg/kubectl KUBE_COVER=y
+```
+
+Multiple arguments can be passed, in which case the coverage results will be
+combined for all tests run.
+
+### Benchmark unit tests
+
+To run benchmark tests, you'll typically use something like:
+
+```sh
+go test ./pkg/apiserver -benchmem -run=XXX -bench=BenchmarkWatch
+```
+
+This will do the following:
+
+1. `-run=XXX` is a regular expression filter on the name of test cases to run
+2. `-bench=BenchmarkWatch` will run test methods with BenchmarkWatch in the name
+   * See `grep -nr BenchmarkWatch .` for examples
+3. `-benchmem` enables memory allocation stats
+
+See `go help test` and `go help testflag` for additional info.
+
+## Integration tests
+
+* Integration tests should only access other resources on the local machine
+  - Most commonly etcd or a service listening on localhost.
+* All significant features require integration tests.
+  - This includes kubectl commands.
+* The preferred method of testing multiple scenarios or inputs
+is [table driven testing](https://github.com/golang/go/wiki/TableDrivenTests)
+  - Example: [TestNamespaceAuthorization](https://git.k8s.io/kubernetes/test/integration/auth/auth_test.go)
+* Each test should create its own master, HTTP server, and config.
+  - Example: [TestPodUpdateActiveDeadlineSeconds](https://git.k8s.io/kubernetes/test/integration/pods/pods_test.go)
+* See [coding conventions](../../guide/coding-conventions.md).
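+
+To make the table-driven guideline above concrete, here is a minimal,
+hypothetical sketch of such a test (the `validateName` function and its cases
+are illustrative only, not taken from the Kubernetes tree):
+
+```go
+package validation
+
+import (
+	"strings"
+	"testing"
+)
+
+// validateName is a hypothetical stand-in for the function under test.
+func validateName(name string) bool {
+	return name != "" && len(name) <= 63
+}
+
+func TestValidateName(t *testing.T) {
+	testCases := []struct {
+		desc  string
+		input string
+		want  bool
+	}{
+		{desc: "simple name is valid", input: "my-pod", want: true},
+		{desc: "empty name is invalid", input: "", want: false},
+		{desc: "overlong name is invalid", input: strings.Repeat("a", 64), want: false},
+	}
+	for _, tc := range testCases {
+		// Each case runs through the same assertion logic under its own name.
+		t.Run(tc.desc, func(t *testing.T) {
+			if got := validateName(tc.input); got != tc.want {
+				t.Errorf("validateName(%q) = %v, want %v", tc.input, got, tc.want)
+			}
+		})
+	}
+}
+```
+
+Adding a new scenario is then a one-line change, and `t.Run` gives each case
+its own name in the failure output.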
+
+### Install etcd dependency
+
+Kubernetes integration tests require your `PATH` to include an
+[etcd](https://github.com/coreos/etcd/releases) installation. Kubernetes
+includes a script to help install etcd on your machine.
+
+```sh
+# Install etcd and add to PATH
+
+# Option a) install inside kubernetes root
+hack/install-etcd.sh # Installs in ./third_party/etcd
+echo export PATH="\$PATH:$(pwd)/third_party/etcd" >> ~/.profile # Add to PATH
+
+# Option b) install manually
+grep -E "image.*etcd" cluster/gce/manifests/etcd.manifest # Find version
+# Install that version using yum/apt-get/etc
+echo export PATH="\$PATH:<LOCATION>" >> ~/.profile # Add to PATH
+```
+
+### Etcd test data
+
+Many tests start an etcd server internally, storing test data in the operating system's temporary directory.
+
+If you see test failures because the temporary directory does not have sufficient space,
+or is on a volume with unpredictable write latency, you can override the test data directory
+for those internal etcd instances with the `TEST_ETCD_DIR` environment variable.
+
+### Run integration tests
+
+The integration tests are run using `make test-integration`.
+The Kubernetes integration tests are written using the normal golang testing
+package but expect to have a running etcd instance to connect to. The `test-integration.sh`
+script wraps `make test` and sets up an etcd instance for the integration tests to use.
+
+```sh
+make test-integration # Run all integration tests.
+```
+
+This script runs the golang tests in package
+[`test/integration`](https://git.k8s.io/kubernetes/test/integration).
+
+### Run a specific integration test
+
+You can also use the `KUBE_TEST_ARGS` environment variable with `make test-integration`
+to run a specific integration test case:
+
+```sh
+# Run integration test TestPodUpdateActiveDeadlineSeconds with the verbose flag set.
+make test-integration WHAT=./test/integration/pods GOFLAGS="-v" KUBE_TEST_ARGS="-run ^TestPodUpdateActiveDeadlineSeconds$"
+```
+
+If you set `KUBE_TEST_ARGS`, the test case will be run with only the `v1` API
+version, and the watch cache test is skipped.
+
+## End-to-End tests
+
+Please refer to [End-to-End Testing in Kubernetes](e2e-tests.md).
diff --git a/contributors/devel/sig-testing/writing-good-e2e-tests.md b/contributors/devel/sig-testing/writing-good-e2e-tests.md
new file mode 100644
index 00000000..836479c2
--- /dev/null
+++ b/contributors/devel/sig-testing/writing-good-e2e-tests.md
@@ -0,0 +1,231 @@
+# Writing good e2e tests for Kubernetes #
+
+## Patterns and Anti-Patterns ##
+
+### Goals of e2e tests ###
+
+Beyond the obvious goal of providing end-to-end system test coverage,
+there are a few less obvious goals that you should bear in mind when
+designing, writing and debugging your end-to-end tests. In
+particular, "flaky" tests, which pass most of the time but fail
+intermittently for difficult-to-diagnose reasons, are extremely costly
+in terms of blurring our regression signals and slowing down our
+automated merge velocity. Up-front time and effort designing your test
+to be reliable is very well spent. Bear in mind that we have hundreds
+of tests, each running in dozens of different environments, and if any
+test in any test environment fails, we have to assume that we
+potentially have some sort of regression. So if a significant number
+of tests fail even 1% of the time, basic statistics dictates that
+we will almost never have a "green" regression indicator.
Stated
+another way, writing a test that is only 99% reliable is just about
+useless in the harsh reality of a CI environment. In fact, it's worse
+than useless, because not only does it not provide a reliable
+regression indicator, but it also costs a lot of subsequent debugging
+time, and delayed merges.
+
+#### Debuggability ####
+
+If your test fails, it should provide as much detail as possible about
+the reasons for the failure in its output. "Timeout" is not a useful error
+message. "Timed out after 60 seconds waiting for pod xxx to enter
+running state, still in pending state" is much more useful to someone
+trying to figure out why your test failed and what to do about it.
+Specifically,
+[assertion](https://onsi.github.io/gomega/#making-assertions) code
+like the following generates rather useless errors:
+
+```
+Expect(err).NotTo(HaveOccurred())
+```
+
+Rather,
+[annotate](https://onsi.github.io/gomega/#annotating-assertions) your assertion with something like this:
+
+```
+Expect(err).NotTo(HaveOccurred(), "Failed to create %d foobars, only created %d", foobarsReqd, foobarsCreated)
+```
+
+On the other hand, overly verbose logging, particularly of non-error conditions, can make
+it unnecessarily difficult to figure out whether a test failed and, if
+so, why. So don't log lots of irrelevant stuff either.
+
+#### Ability to run in non-dedicated test clusters ####
+
+To reduce end-to-end delay and improve resource utilization when
+running e2e tests, we try, where possible, to run large numbers of
+tests in parallel against the same test cluster. This means that:
+
+1. You should avoid making any assumption (implicit or explicit) that
+your test is the only thing running against the cluster. For example,
+assuming that your test can run a pod on every node in a
+cluster is not safe, as some other tests, running at the
+same time as yours, might have saturated one or more nodes in the
+cluster. Similarly, running a pod in the system namespace, and
+assuming that will increase the count of pods in the system
+namespace by one is not safe, as some other test might be creating or
+deleting pods in the system namespace at the same time as your test.
+If you do legitimately need to write a test like that, make sure to
+label it ["\[Serial\]"](e2e-tests.md#kinds-of-tests) so that it's easy
+to identify, and not run in parallel with any other tests.
+1. You should avoid doing things to the cluster that make it difficult
+for other tests to reliably do what they're trying to do, at the same
+time. For example, rebooting nodes, disconnecting network interfaces,
+or upgrading cluster software as part of your test is likely to
+violate the assumptions that other tests might have made about a
+reasonably stable cluster environment. If you need to write such
+tests, please label them as
+["\[Disruptive\]"](e2e-tests.md#kinds-of-tests) so that it's easy to
+identify them, and not run them in parallel with other tests.
+1. You should avoid making assumptions about the Kubernetes API that
+are not part of the API specification, as your tests will break as
+soon as these assumptions become invalid. For example, relying on
+specific Events, Event reasons or Event messages will make your tests
+very brittle.
+
+#### Speed of execution ####
+
+We have hundreds of e2e tests, some of which, in some cases, we run in
+serial, one after the other. If each test takes just a few minutes
+to run, that very quickly adds up to many, many hours of total
+execution time.
We try to keep such total execution time down to a
+few tens of minutes at most. Therefore, try (very hard) to keep the
+execution time of your individual tests below 2 minutes, ideally
+shorter than that. Concretely, adding inappropriately long 'sleep'
+statements or other gratuitous waits to tests is a killer. If under
+normal circumstances your pod enters the running state within 10
+seconds, and 99.9% of the time within 30 seconds, it would be
+gratuitous to wait 5 minutes for this to happen. Rather just fail
+after 30 seconds, with a clear error message as to why your test
+failed (e.g. "Pod x failed to become ready after 30 seconds; it
+usually takes 10 seconds"). If you do have a truly legitimate reason
+for waiting longer than that, or writing a test which takes longer
+than 2 minutes to run, comment very clearly in the code why this is
+necessary, and label the test as
+["\[Slow\]"](e2e-tests.md#kinds-of-tests), so that it's easy to
+identify and avoid in test runs that are required to complete
+timeously (for example those that are run against every code
+submission before it is allowed to be merged).
+Note that completing within, say, 2 minutes only when the test
+passes is not generally good enough. Your test should also fail in a
+reasonable time. We have seen tests that, for example, wait up to 10
+minutes for each of several pods to become ready. Under good
+conditions these tests might pass within a few seconds, but if the
+pods never become ready (e.g. due to a system regression) they take a
+very long time to fail and typically cause the entire test run to time
+out, so that no results are produced. Again, this is a lot less
+useful than a test that fails reliably within a minute or two when the
+system is not working correctly.
+
+#### Resilience to relatively rare, temporary infrastructure glitches or delays ####
+
+Remember that your test will be run many thousands of
+times, at different times of day and night, probably on different
+cloud providers, under different load conditions. And often the
+underlying state of these systems is stored in eventually consistent
+data stores. So, for example, if a resource creation request is
+theoretically asynchronous, even if you observe it to be practically
+synchronous most of the time, write your test to assume that it's
+asynchronous (e.g. make the "create" call, and poll or watch the
+resource until it's in the correct state before proceeding).
+Similarly, don't assume that API endpoints are 100% available.
+They're not. Under high load conditions, API calls might temporarily
+fail or time out. In such cases it's appropriate to back off and retry
+a few times before failing your test completely (in which case make
+the error message very clear about what happened, e.g. "Retried
+http://... 3 times - all failed with xxx"). Use the standard
+retry mechanisms provided in the libraries detailed below.
+
+### Some concrete tools at your disposal ###
+
+Obviously most of the above goals apply to many tests, not just yours.
+So we've developed a set of reusable test infrastructure, libraries
+and best practices to help you to do the right thing, or at least do
+the same thing as other tests, so that if that turns out to be the
+wrong thing, it can be fixed in one place, not hundreds, to be the
+right thing.
+
+Here are a few pointers:
+
++ [E2e Framework](https://git.k8s.io/kubernetes/test/e2e/framework/framework.go):
+  Familiarise yourself with this test framework and how to use it.
+  Amongst other things, it automatically creates uniquely named namespaces
+  within which your tests can run to avoid name clashes, and reliably
+  automates cleaning up the mess after your test has completed (it
+  just deletes everything in the namespace). This helps to ensure
+  that tests do not leak resources. Note that deleting a namespace
+  (and by implication everything in it) is currently an expensive
+  operation. So the fewer resources you create, the less cleaning up
+  the framework needs to do, and the faster your test (and other
+  tests running concurrently with yours) will complete. Your tests
+  should always use this framework. Trying other home-grown
+  approaches to avoiding name clashes and resource leaks has proven
+  to be a very bad idea.
++ [E2e utils library](https://git.k8s.io/kubernetes/test/e2e/framework/util.go):
+  This handy library provides tons of reusable code for a host of
+  commonly needed test functionality, including waiting for resources
+  to enter specified states, safely and consistently retrying failed
+  operations, usefully reporting errors, and much more. Make sure
+  that you're familiar with what's available there, and use it.
+  Likewise, if you come across a generally useful mechanism that's
+  not yet implemented there, add it so that others can benefit from
+  your brilliance. In particular, pay attention to the variety of
+  timeout- and retry-related constants at the top of that file. Always
+  try to reuse these constants rather than try to dream up your own
+  values. Even if the values there are not precisely what you would
+  like to use (timeout periods, retry counts, etc.), the benefit of
+  having them be consistent and centrally configurable across our
+  entire test suite typically outweighs your personal preferences.
++ **Follow the examples of stable, well-written tests:** Some of our
+  existing end-to-end tests are better written and more reliable than
+  others. A few examples of well-written tests include:
+  [Replication Controllers](https://git.k8s.io/kubernetes/test/e2e/apps/rc.go),
+  [Services](https://git.k8s.io/kubernetes/test/e2e/network/service.go),
+  [Reboot](https://git.k8s.io/kubernetes/test/e2e/lifecycle/reboot.go).
++ [Ginkgo Test Framework](https://github.com/onsi/ginkgo): This is the
+  test library and runner upon which our e2e tests are built. Before
+  you write or refactor a test, read the docs and make sure that you
+  understand how it works. In particular, be aware that every test is
+  uniquely identified and described (e.g. in test reports) by the
+  concatenation of its `Describe` clause and nested `It` clauses.
+  So for example `Describe("Pods",...).... It("should be scheduled
+  with cpu and memory limits")` produces a sane test identifier and
+  descriptor `Pods should be scheduled with cpu and memory limits`,
+  which makes it clear what's being tested, and hence what's not
+  working if it fails.
Other good examples include: + +``` + CAdvisor should be healthy on every node +``` + +and + +``` + Daemon set should run and stop complex daemon +``` + + On the contrary +(these are real examples), the following are less good test +descriptors: + +``` + KubeProxy should test kube-proxy +``` + +and + +``` +Nodes [Disruptive] Network when a node becomes unreachable +[replication controller] recreates pods scheduled on the +unreachable node AND allows scheduling of pods on a node after +it rejoins the cluster +``` + +An improvement might be + +``` +Unreachable nodes are evacuated and then repopulated upon rejoining [Disruptive] +``` + +Note that opening issues for specific better tooling is welcome, and +code implementing that tooling is even more welcome :-). + diff --git a/contributors/devel/testing.md b/contributors/devel/testing.md index 60f83b53..5bb42eeb 100644 --- a/contributors/devel/testing.md +++ b/contributors/devel/testing.md @@ -1,227 +1,3 @@ -# Testing guide +This file has moved to https://git.k8s.io/community/contributors/devel/sig-testing/testing.md. -**Table of Contents** - -- [Testing guide](#testing-guide) - - [Unit tests](#unit-tests) - - [Run all unit tests](#run-all-unit-tests) - - [Set go flags during unit tests](#set-go-flags-during-unit-tests) - - [Run unit tests from certain packages](#run-unit-tests-from-certain-packages) - - [Run specific unit test cases in a package](#run-specific-unit-test-cases-in-a-package) - - [Stress running unit tests](#stress-running-unit-tests) - - [Unit test coverage](#unit-test-coverage) - - [Benchmark unit tests](#benchmark-unit-tests) - - [Integration tests](#integration-tests) - - [Install etcd dependency](#install-etcd-dependency) - - [Etcd test data](#etcd-test-data) - - [Run integration tests](#run-integration-tests) - - [Run a specific integration test](#run-a-specific-integration-test) - - [End-to-End tests](#end-to-end-tests) - - -This assumes you already read the [development guide](development.md) to -install go, godeps, and configure your git client. All command examples are -relative to the `kubernetes` root directory. - -Before sending pull requests you should at least make sure your changes have -passed both unit and integration tests. - -Kubernetes only merges pull requests when unit, integration, and e2e tests are -passing, so it is often a good idea to make sure the e2e tests work as well. - -## Unit tests - -* Unit tests should be fully hermetic - - Only access resources in the test binary. -* All packages and any significant files require unit tests. -* The preferred method of testing multiple scenarios or input is - [table driven testing](https://github.com/golang/go/wiki/TableDrivenTests) - - Example: [TestNamespaceAuthorization](https://git.k8s.io/kubernetes/test/integration/auth/auth_test.go) -* Unit tests must pass on macOS and Windows platforms. - - Tests using linux-specific features must be skipped or compiled out. - - Skipped is better, compiled out is required when it won't compile. -* Concurrent unit test runs must pass. -* See [coding conventions](../guide/coding-conventions.md). - -### Run all unit tests - -`make test` is the entrypoint for running the unit tests that ensures that -`GOPATH` is set up correctly. If you have `GOPATH` set up correctly, you can -also just use `go test` directly. - -```sh -cd kubernetes -make test # Run all unit tests. 
-``` - -If any unit test fails with a timeout panic (see [#1594](https://github.com/kubernetes/community/issues/1594)) on the testing package, you can increase the `KUBE_TIMEOUT` value as shown below. - -```sh -make test KUBE_TIMEOUT="-timeout 300s" -``` - -### Set go flags during unit tests - -You can set [go flags](https://golang.org/cmd/go/) by setting the -`GOFLAGS` environment variable. - -### Run unit tests from certain packages - -`make test` accepts packages as arguments; the `k8s.io/kubernetes` prefix is -added automatically to these: - -```sh -make test WHAT=./pkg/api # run tests for pkg/api -``` - -To run multiple targets you need quotes: - -```sh -make test WHAT="./pkg/api ./pkg/kubelet" # run tests for pkg/api and pkg/kubelet -``` - -In a shell, it's often handy to use brace expansion: - -```sh -make test WHAT=./pkg/{api,kubelet} # run tests for pkg/api and pkg/kubelet -``` - -### Run specific unit test cases in a package - -You can set the test args using the `KUBE_TEST_ARGS` environment variable. -You can use this to pass the `-run` argument to `go test`, which accepts a -regular expression for the name of the test that should be run. - -```sh -# Runs TestValidatePod in pkg/api/validation with the verbose flag set -make test WHAT=./pkg/api/validation GOFLAGS="-v" KUBE_TEST_ARGS='-run ^TestValidatePod$' - -# Runs tests that match the regex ValidatePod|ValidateConfigMap in pkg/api/validation -make test WHAT=./pkg/api/validation GOFLAGS="-v" KUBE_TEST_ARGS="-run ValidatePod\|ValidateConfigMap$" -``` - -For other supported test flags, see the [golang -documentation](https://golang.org/cmd/go/#hdr-Testing_flags). - -### Stress running unit tests - -Running the same tests repeatedly is one way to root out flakes. -You can do this efficiently. - -```sh -# Have 2 workers run all tests 5 times each (10 total iterations). -make test PARALLEL=2 ITERATION=5 -``` - -For more advanced ideas please see [flaky-tests.md](flaky-tests.md). - -### Unit test coverage - -Currently, collecting coverage is only supported for the Go unit tests. - -To run all unit tests and generate an HTML coverage report, run the following: - -```sh -make test KUBE_COVER=y -``` - -At the end of the run, an HTML report will be generated with the path -printed to stdout. - -To run tests and collect coverage in only one package, pass its relative path -under the `kubernetes` directory as an argument, for example: - -```sh -make test WHAT=./pkg/kubectl KUBE_COVER=y -``` - -Multiple arguments can be passed, in which case the coverage results will be -combined for all tests run. - -### Benchmark unit tests - -To run benchmark tests, you'll typically use something like: - -```sh -go test ./pkg/apiserver -benchmem -run=XXX -bench=BenchmarkWatch -``` - -This will do the following: - -1. `-run=XXX` is a regular expression filter on the name of test cases to run -2. `-bench=BenchmarkWatch` will run test methods with BenchmarkWatch in the name - * See `grep -nr BenchmarkWatch .` for examples -3. `-benchmem` enables memory allocation stats - -See `go help test` and `go help testflag` for additional info. - -## Integration tests - -* Integration tests should only access other resources on the local machine - - Most commonly etcd or a service listening on localhost. -* All significant features require integration tests. 
- - This includes kubectl commands -* The preferred method of testing multiple scenarios or inputs -is [table driven testing](https://github.com/golang/go/wiki/TableDrivenTests) - - Example: [TestNamespaceAuthorization](https://git.k8s.io/kubernetes/test/integration/auth/auth_test.go) -* Each test should create its own master, httpserver and config. - - Example: [TestPodUpdateActiveDeadlineSeconds](https://git.k8s.io/kubernetes/test/integration/pods/pods_test.go) -* See [coding conventions](coding-conventions.md). - -### Install etcd dependency - -Kubernetes integration tests require your `PATH` to include an -[etcd](https://github.com/coreos/etcd/releases) installation. Kubernetes -includes a script to help install etcd on your machine. - -```sh -# Install etcd and add to PATH - -# Option a) install inside kubernetes root -hack/install-etcd.sh # Installs in ./third_party/etcd -echo export PATH="\$PATH:$(pwd)/third_party/etcd" >> ~/.profile # Add to PATH - -# Option b) install manually -grep -E "image.*etcd" cluster/gce/manifests/etcd.manifest # Find version -# Install that version using yum/apt-get/etc -echo export PATH="\$PATH:<LOCATION>" >> ~/.profile # Add to PATH -``` - -### Etcd test data - -Many tests start an etcd server internally, storing test data in the operating system's temporary directory. - -If you see test failures because the temporary directory does not have sufficient space, -or is on a volume with unpredictable write latency, you can override the test data directory -for those internal etcd instances with the `TEST_ETCD_DIR` environment variable. - -### Run integration tests - -The integration tests are run using `make test-integration`. -The Kubernetes integration tests are written using the normal golang testing -package but expect to have a running etcd instance to connect to. The `test-integration.sh` -script wraps `make test` and sets up an etcd instance for the integration tests to use. - -```sh -make test-integration # Run all integration tests. -``` - -This script runs the golang tests in package -[`test/integration`](https://git.k8s.io/kubernetes/test/integration). - -### Run a specific integration test - -You can also use the `KUBE_TEST_ARGS` environment variable with the `make test-integration` -to run a specific integration test case: - -```sh -# Run integration test TestPodUpdateActiveDeadlineSeconds with the verbose flag set. -make test-integration WHAT=./test/integration/pods GOFLAGS="-v" KUBE_TEST_ARGS="-run ^TestPodUpdateActiveDeadlineSeconds$" -``` - -If you set `KUBE_TEST_ARGS`, the test case will be run with only the `v1` API -version and the watch cache test is skipped. - -## End-to-End tests - -Please refer to [End-to-End Testing in Kubernetes](e2e-tests.md). +This file is a placeholder to preserve links. Please remove by April 30, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file diff --git a/contributors/devel/writing-good-e2e-tests.md b/contributors/devel/writing-good-e2e-tests.md index 836479c2..b39208eb 100644 --- a/contributors/devel/writing-good-e2e-tests.md +++ b/contributors/devel/writing-good-e2e-tests.md @@ -1,231 +1,3 @@ -# Writing good e2e tests for Kubernetes # - -## Patterns and Anti-Patterns ## - -### Goals of e2e tests ### - -Beyond the obvious goal of providing end-to-end system test coverage, -there are a few less obvious goals that you should bear in mind when -designing, writing and debugging your end-to-end tests. In -particular, "flaky" tests, which pass most of the time but fail -intermittently for difficult-to-diagnose reasons are extremely costly -in terms of blurring our regression signals and slowing down our -automated merge velocity. Up-front time and effort designing your test -to be reliable is very well spent. Bear in mind that we have hundreds -of tests, each running in dozens of different environments, and if any -test in any test environment fails, we have to assume that we -potentially have some sort of regression. So if a significant number -of tests fail even only 1% of the time, basic statistics dictates that -we will almost never have a "green" regression indicator. Stated -another way, writing a test that is only 99% reliable is just about -useless in the harsh reality of a CI environment. In fact it's worse -than useless, because not only does it not provide a reliable -regression indicator, but it also costs a lot of subsequent debugging -time, and delayed merges. - -#### Debuggability #### - -If your test fails, it should provide as detailed as possible reasons -for the failure in its output. "Timeout" is not a useful error -message. "Timed out after 60 seconds waiting for pod xxx to enter -running state, still in pending state" is much more useful to someone -trying to figure out why your test failed and what to do about it. -Specifically, -[assertion](https://onsi.github.io/gomega/#making-assertions) code -like the following generates rather useless errors: - -``` -Expect(err).NotTo(HaveOccurred()) -``` - -Rather -[annotate](https://onsi.github.io/gomega/#annotating-assertions) your assertion with something like this: - -``` -Expect(err).NotTo(HaveOccurred(), "Failed to create %d foobars, only created %d", foobarsReqd, foobarsCreated) -``` - -On the other hand, overly verbose logging, particularly of non-error conditions, can make -it unnecessarily difficult to figure out whether a test failed and if -so why? So don't log lots of irrelevant stuff either. - -#### Ability to run in non-dedicated test clusters #### - -To reduce end-to-end delay and improve resource utilization when -running e2e tests, we try, where possible, to run large numbers of -tests in parallel against the same test cluster. This means that: - -1. you should avoid making any assumption (implicit or explicit) that -your test is the only thing running against the cluster. For example, -making the assumption that your test can run a pod on every node in a -cluster is not a safe assumption, as some other tests, running at the -same time as yours, might have saturated one or more nodes in the -cluster. Similarly, running a pod in the system namespace, and -assuming that will increase the count of pods in the system -namespace by one is not safe, as some other test might be creating or -deleting pods in the system namespace at the same time as your test. 
-If you do legitimately need to write a test like that, make sure to -label it ["\[Serial\]"](e2e-tests.md#kinds-of-tests) so that it's easy -to identify, and not run in parallel with any other tests. -1. You should avoid doing things to the cluster that make it difficult -for other tests to reliably do what they're trying to do, at the same -time. For example, rebooting nodes, disconnecting network interfaces, -or upgrading cluster software as part of your test is likely to -violate the assumptions that other tests might have made about a -reasonably stable cluster environment. If you need to write such -tests, please label them as -["\[Disruptive\]"](e2e-tests.md#kinds-of-tests) so that it's easy to -identify them, and not run them in parallel with other tests. -1. You should avoid making assumptions about the Kubernetes API that -are not part of the API specification, as your tests will break as -soon as these assumptions become invalid. For example, relying on -specific Events, Event reasons or Event messages will make your tests -very brittle. - -#### Speed of execution #### - -We have hundreds of e2e tests, some of which we run in serial, one -after the other, in some cases. If each test takes just a few minutes -to run, that very quickly adds up to many, many hours of total -execution time. We try to keep such total execution time down to a -few tens of minutes at most. Therefore, try (very hard) to keep the -execution time of your individual tests below 2 minutes, ideally -shorter than that. Concretely, adding inappropriately long 'sleep' -statements or other gratuitous waits to tests is a killer. If under -normal circumstances your pod enters the running state within 10 -seconds, and 99.9% of the time within 30 seconds, it would be -gratuitous to wait 5 minutes for this to happen. Rather just fail -after 30 seconds, with a clear error message as to why your test -failed ("e.g. Pod x failed to become ready after 30 seconds, it -usually takes 10 seconds"). If you do have a truly legitimate reason -for waiting longer than that, or writing a test which takes longer -than 2 minutes to run, comment very clearly in the code why this is -necessary, and label the test as -["\[Slow\]"](e2e-tests.md#kinds-of-tests), so that it's easy to -identify and avoid in test runs that are required to complete -timeously (for example those that are run against every code -submission before it is allowed to be merged). -Note that completing within, say, 2 minutes only when the test -passes is not generally good enough. Your test should also fail in a -reasonable time. We have seen tests that, for example, wait up to 10 -minutes for each of several pods to become ready. Under good -conditions these tests might pass within a few seconds, but if the -pods never become ready (e.g. due to a system regression) they take a -very long time to fail and typically cause the entire test run to time -out, so that no results are produced. Again, this is a lot less -useful than a test that fails reliably within a minute or two when the -system is not working correctly. - -#### Resilience to relatively rare, temporary infrastructure glitches or delays #### - -Remember that your test will be run many thousands of -times, at different times of day and night, probably on different -cloud providers, under different load conditions. And often the -underlying state of these systems is stored in eventually consistent -data stores. 
So, for example, if a resource creation request is -theoretically asynchronous, even if you observe it to be practically -synchronous most of the time, write your test to assume that it's -asynchronous (e.g. make the "create" call, and poll or watch the -resource until it's in the correct state before proceeding). -Similarly, don't assume that API endpoints are 100% available. -They're not. Under high load conditions, API calls might temporarily -fail or time-out. In such cases it's appropriate to back off and retry -a few times before failing your test completely (in which case make -the error message very clear about what happened, e.g. "Retried -http://... 3 times - all failed with xxx". Use the standard -retry mechanisms provided in the libraries detailed below. - -### Some concrete tools at your disposal ### - -Obviously most of the above goals apply to many tests, not just yours. -So we've developed a set of reusable test infrastructure, libraries -and best practices to help you to do the right thing, or at least do -the same thing as other tests, so that if that turns out to be the -wrong thing, it can be fixed in one place, not hundreds, to be the -right thing. - -Here are a few pointers: - -+ [E2e Framework](https://git.k8s.io/kubernetes/test/e2e/framework/framework.go): - Familiarise yourself with this test framework and how to use it. - Amongst others, it automatically creates uniquely named namespaces - within which your tests can run to avoid name clashes, and reliably - automates cleaning up the mess after your test has completed (it - just deletes everything in the namespace). This helps to ensure - that tests do not leak resources. Note that deleting a namespace - (and by implication everything in it) is currently an expensive - operation. So the fewer resources you create, the less cleaning up - the framework needs to do, and the faster your test (and other - tests running concurrently with yours) will complete. Your tests - should always use this framework. Trying other home-grown - approaches to avoiding name clashes and resource leaks has proven - to be a very bad idea. -+ [E2e utils library](https://git.k8s.io/kubernetes/test/e2e/framework/util.go): - This handy library provides tons of reusable code for a host of - commonly needed test functionality, including waiting for resources - to enter specified states, safely and consistently retrying failed - operations, usefully reporting errors, and much more. Make sure - that you're familiar with what's available there, and use it. - Likewise, if you come across a generally useful mechanism that's - not yet implemented there, add it so that others can benefit from - your brilliance. In particular pay attention to the variety of - timeout and retry related constants at the top of that file. Always - try to reuse these constants rather than try to dream up your own - values. Even if the values there are not precisely what you would - like to use (timeout periods, retry counts etc), the benefit of - having them be consistent and centrally configurable across our - entire test suite typically outweighs your personal preferences. -+ **Follow the examples of stable, well-written tests:** Some of our - existing end-to-end tests are better written and more reliable than - others. 
A few examples of well-written tests include: - [Replication Controllers](https://git.k8s.io/kubernetes/test/e2e/apps/rc.go), - [Services](https://git.k8s.io/kubernetes/test/e2e/network/service.go), - [Reboot](https://git.k8s.io/kubernetes/test/e2e/lifecycle/reboot.go). -+ [Ginkgo Test Framework](https://github.com/onsi/ginkgo): This is the - test library and runner upon which our e2e tests are built. Before - you write or refactor a test, read the docs and make sure that you - understand how it works. In particular be aware that every test is - uniquely identified and described (e.g. in test reports) by the - concatenation of its `Describe` clause and nested `It` clauses. - So for example `Describe("Pods",...).... It(""should be scheduled - with cpu and memory limits")` produces a sane test identifier and - descriptor `Pods should be scheduled with cpu and memory limits`, - which makes it clear what's being tested, and hence what's not - working if it fails. Other good examples include: - -``` - CAdvisor should be healthy on every node -``` - -and - -``` - Daemon set should run and stop complex daemon -``` - - On the contrary -(these are real examples), the following are less good test -descriptors: - -``` - KubeProxy should test kube-proxy -``` - -and - -``` -Nodes [Disruptive] Network when a node becomes unreachable -[replication controller] recreates pods scheduled on the -unreachable node AND allows scheduling of pods on a node after -it rejoins the cluster -``` - -An improvement might be - -``` -Unreachable nodes are evacuated and then repopulated upon rejoining [Disruptive] -``` - -Note that opening issues for specific better tooling is welcome, and -code implementing that tooling is even more welcome :-). +This file has moved to https://git.k8s.io/community/contributors/devel/sig-testing/writing-good-e2e-tests.md. +This file is a placeholder to preserve links. Please remove by April 30, 2019 or the release of kubernetes 1.13, whichever comes first.
\ No newline at end of file
diff --git a/contributors/guide/OWNERS b/contributors/guide/OWNERS
index a9abb261..745a9be0 100644
--- a/contributors/guide/OWNERS
+++ b/contributors/guide/OWNERS
@@ -1,3 +1,5 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+
 reviewers:
   - castrojo
   - guineveresaenger
@@ -8,6 +10,7 @@ reviewers:
 approvers:
   - castrojo
   - parispittman
+  - guineveresaenger
 labels:
   - sig/contributor-experience
   - area/contributor-guide
diff --git a/contributors/guide/README.md b/contributors/guide/README.md
index 08bbdf95..e91eb0fc 100644
--- a/contributors/guide/README.md
+++ b/contributors/guide/README.md
@@ -217,14 +217,14 @@ When reviewing PRs from others [The Gentle Art of Patch Review](http://sage.thes
 ## Testing
 
 Testing is the responsibility of all contributors and is in part owned by all SIGs, but is also coordinated by [sig-testing](/sig-testing).
-Refer to the [Testing Guide](/contributors/devel/testing.md) for more information.
+Refer to the [Testing Guide](/contributors/devel/sig-testing/testing.md) for more information.
 
 There are multiple types of tests. The location of the test code varies with type, as do the specifics of the environment needed to successfully run the test:
 
 * Unit: These confirm that a particular function behaves as intended. Golang includes a native ability for unit testing via the [testing](https://golang.org/pkg/testing/) package. Unit test source code can be found adjacent to the corresponding source code within a given package. For example: functions defined in [kubernetes/cmd/kubeadm/app/util/version.go](https://git.k8s.io/kubernetes/cmd/kubeadm/app/util/version.go) will have unit tests in [kubernetes/cmd/kubeadm/app/util/version_test.go](https://git.k8s.io/kubernetes/cmd/kubeadm/app/util/version_test.go). These are easily run locally by any developer on any OS.
 * Integration: These tests cover interactions of package components or interactions between kubernetes components and some other non-kubernetes system resource (eg: etcd). An example would be testing whether a piece of code can correctly store data to or retrieve data from etcd. Integration tests are stored in [kubernetes/test/integration/](https://git.k8s.io/kubernetes/test/integration). Running these can require the developer set up additional functionality on their development system.
-* End-to-end ("e2e"): These are broad tests of overall system behavior and coherence. These are more complicated as they require a functional kubernetes cluster built from the sources to be tested. A separate [document detailing e2e testing](/contributors/devel/e2e-tests.md) and test cases themselves can be found in [kubernetes/test/e2e/](https://git.k8s.io/kubernetes/test/e2e).
+* End-to-end ("e2e"): These are broad tests of overall system behavior and coherence. These are more complicated as they require a functional kubernetes cluster built from the sources to be tested. A separate [document detailing e2e testing](/contributors/devel/sig-testing/e2e-tests.md) and test cases themselves can be found in [kubernetes/test/e2e/](https://git.k8s.io/kubernetes/test/e2e).
 * Conformance: These are a set of testcases, currently a subset of the integration/e2e tests, that the Architecture SIG has approved to define the core set of interoperable features that all Kubernetes deployments must support. For more information on Conformance tests please see the [Conformance Testing](/contributors/devel/sig-architecture/conformance-tests.md) Document.
Continuous integration will run these tests either as pre-submits on PRs, post-submits against master/release branches, or both. diff --git a/contributors/guide/coding-conventions.md b/contributors/guide/coding-conventions.md index 87e6432f..880c1833 100644 --- a/contributors/guide/coding-conventions.md +++ b/contributors/guide/coding-conventions.md @@ -61,7 +61,7 @@ following Go conventions - `stateLock`, `mapLock` etc. - [Kubectl conventions](/contributors/devel/kubectl-conventions.md) - - [Logging conventions](/contributors/devel/logging.md) + - [Logging conventions](/contributors/devel/sig-instrumentation/logging.md) ## Testing conventions @@ -72,7 +72,7 @@ tests example, see [TestNamespaceAuthorization](https://git.k8s.io/kubernetes/test/integration/auth/auth_test.go) - Significant features should come with integration (test/integration) and/or -[end-to-end (test/e2e) tests](/contributors/devel/e2e-tests.md) +[end-to-end (test/e2e) tests](/contributors/devel/sig-testing/e2e-tests.md) - Including new kubectl commands and major features of existing commands - Unit tests must pass on macOS and Windows platforms - if you use Linux @@ -86,7 +86,7 @@ required when your code does not compile on Windows). asynchronous thing to happen (e.g. wait for 1 seconds and expect a Pod to be running). Wait and retry instead. - - See the [testing guide](/contributors/devel/testing.md) for additional testing advice. + - See the [testing guide](/contributors/devel/sig-testing/testing.md) for additional testing advice. ## Directory and file conventions diff --git a/contributors/guide/contributor-cheatsheet.md b/contributors/guide/contributor-cheatsheet.md index 180a368f..320cd980 100644 --- a/contributors/guide/contributor-cheatsheet.md +++ b/contributors/guide/contributor-cheatsheet.md @@ -20,7 +20,7 @@ A list of common resources when contributing to Kubernetes. - [GitHub labels](https://go.k8s.io/github-labels) - [Release Buckets](https://gcsweb.k8s.io/gcs/kubernetes-release/) - Developer Guide - - [Cherry Picking Guide](/contributors/devel/cherry-picks.md) + - [Cherry Picking Guide](/contributors/devel/sig-release/cherry-picks.md) - [Kubernetes Code Search](https://cs.k8s.io/), maintained by [@dims](https://github.com/dims) diff --git a/contributors/guide/github-workflow.md b/contributors/guide/github-workflow.md index 221a7921..cc1e7e8f 100644 --- a/contributors/guide/github-workflow.md +++ b/contributors/guide/github-workflow.md @@ -149,17 +149,17 @@ make test make test WHAT=./pkg/api/helper GOFLAGS=-v # Run integration tests, requires etcd -# For more info, visit https://git.k8s.io/community/contributors/devel/testing.md#integration-tests +# For more info, visit https://git.k8s.io/community/contributors/devel/sig-testing/testing.md#integration-tests make test-integration # Run e2e tests by building test binaries, turn up a test cluster, run all tests, and tear the cluster down # Equivalent to: go run hack/e2e.go -- -v --build --up --test --down # Note: running all e2e tests takes a LONG time! 
# To run specific e2e tests, visit:
-# https://git.k8s.io/community/contributors/devel/e2e-tests.md#building-kubernetes-and-running-the-tests
+# https://git.k8s.io/community/contributors/devel/sig-testing/e2e-tests.md#building-kubernetes-and-running-the-tests
make test-e2e
```

-See the [testing guide](/contributors/devel/testing.md) and [end-to-end tests](/contributors/devel/e2e-tests.md)
+See the [testing guide](/contributors/devel/sig-testing/testing.md) and [end-to-end tests](/contributors/devel/sig-testing/e2e-tests.md)
for additional information and scenarios.

Run `make help` for additional information on these make targets.
diff --git a/contributors/guide/issue-triage.md b/contributors/guide/issue-triage.md
index ff67ba3e..879648a9 100644
--- a/contributors/guide/issue-triage.md
+++ b/contributors/guide/issue-triage.md
@@ -206,7 +206,7 @@ block the release on it.
A few days before release, we will probably move everything remaining in that milestone in bulk.

More information can be found in the developer guide section for
-[targeting issues and PRs to a milestone release](/contributors/devel/release.md).
+[targeting issues and PRs to a milestone release](/contributors/devel/sig-release/release.md).

## Closing issues

Issues that are identified as a support request, duplicate, not-reproducible
diff --git a/contributors/guide/pull-requests.md b/contributors/guide/pull-requests.md
index a24310a6..a9c26086 100644
--- a/contributors/guide/pull-requests.md
+++ b/contributors/guide/pull-requests.md
@@ -115,7 +115,7 @@ The GitHub robots will add and remove the `do-not-merge/hold` label as you use t

## Pull Requests and the Release Cycle

-If a pull request has been reviewed but held or not approved, it might be due to the current phase in the [Release Cycle](/contributors/devel/release.md). Occasionally, a SIG may freeze their own code base when working towards a specific feature or goal that could impact other development. During this time, your pull request could remain unmerged while their release work is completed.
+If a pull request has been reviewed but held or not approved, it might be due to the current phase in the [Release Cycle](/contributors/devel/sig-release/release.md). Occasionally, a SIG may freeze their own code base when working towards a specific feature or goal that could impact other development. During this time, your pull request could remain unmerged while their release work is completed.

If you feel your pull request is in this state, contact the appropriate [SIG](https://git.k8s.io/community/sig-list.md) or [SIG-Release](https://git.k8s.io/sig-release) for clarification.
diff --git a/contributors/guide/release-notes.md b/contributors/guide/release-notes.md
index 655dff1c..81dca597 100644
--- a/contributors/guide/release-notes.md
+++ b/contributors/guide/release-notes.md
@@ -30,4 +30,4 @@ For pull requests that don't need to be mentioned at release time, use the `/rel

To see how to format your release notes, view the kubernetes/kubernetes [pull request template](https://git.k8s.io/kubernetes/.github/PULL_REQUEST_TEMPLATE.md) for a brief example. Pull Request titles and body comments can be modified at any time prior to the release to make them friendly for release notes.

-Release notes apply to pull requests on the master branch. For cherry-pick pull requests, see the [cherry-pick instructions](/contributors/devel/cherry-picks.md). The only exception to these rules is when a pull request is not a cherry-pick and is targeted directly to the non-master branch. In this case, a `release-note-*` label is required for that non-master pull request.
+Release notes apply to pull requests on the master branch. For cherry-pick pull requests, see the [cherry-pick instructions](/contributors/devel/sig-release/cherry-picks.md). The only exception to these rules is when a pull request is not a cherry-pick and is targeted directly to the non-master branch. In this case, a `release-note-*` label is required for that non-master pull request.
diff --git a/contributors/guide/style-guide.md b/contributors/guide/style-guide.md
new file mode 100644
index 00000000..05ccbb04
--- /dev/null
+++ b/contributors/guide/style-guide.md
@@ -0,0 +1,678 @@
+---
+title: Documentation Style Guide
+---
+
+# Documentation Style Guide
+
+This style guide is for content in the Kubernetes GitHub [community repository].
+It is an extension of the [Kubernetes documentation style-guide].
+
+These are **guidelines**, not rules. Use your best judgement.
+
+- [Cheatsheet](#cheatsheet)
+- [Content design, formatting, and language](#content-design-formatting-and-language)
+  - [Contact information](#contact-information)
+  - [Dates and times](#dates-and-times)
+  - [Diagrams, images and other assets](#diagrams-images-and-other-assets)
+  - [Document Layout](#document-layout)
+  - [Formatting text](#formatting-text)
+  - [Language, grammar, and tone](#language-grammar-and-tone)
+  - [Moving a document](#moving-a-document)
+  - [Punctuation](#punctuation)
+  - [Quotation](#quotation)
+- [Markdown formatting](#markdown-formatting)
+  - [Code blocks](#code-blocks)
+  - [Emphasis](#emphasis)
+  - [Headings](#headings)
+  - [Horizontal rules](#horizontal-rules)
+  - [Line length](#line-length)
+  - [Links](#links)
+  - [Lists](#lists)
+  - [Metadata](#metadata)
+  - [Tables](#tables)
+- [Attribution](#attribution)
+
+
+## Cheatsheet
+
+### Cheatsheet: Content design, formatting, and language
+
+**[Contact information:](#contact-information)**
+- Use official Kubernetes contact information.
+
+**[Dates and times:](#dates-and-times)**
+- Format dates as `month day, year`. (December 13, 2018)
+- When conveying a date in numerical form, use [ISO 8601] format: `yyyy-mm-dd`.
+- Use the 24 hour clock when referencing time.
+- Times for single events (example: KubeCon) should be expressed in an absolute
+  time zone such as Pacific Standard Time (PST) or Coordinated Universal Time
+  (UTC).
+- Times for recurring events should be expressed in a time zone that follows
+  Daylight Saving Time (DST) such as Pacific Time (PT) or Eastern Time (ET).
+- Supply a link to a globally available time zone converter service.
+  - `http://www.thetimezoneconverter.com/?t=<TIME REFERENCE>&tz=<TZ REFERENCE>`
+
+**[Diagrams, images and other assets:](#diagrams-images-and-other-assets)**
+- Images and other assets should be stored in the same directory as the
+  document that references them.
+- Filenames should be lowercase and descriptive of what they are referencing.
+- Avoid excessively large images or include a smaller one while linking to a
+  higher resolution version of the same image.
+- Use the [Kubernetes icon set] for architectural diagrams.
+
+**[Document Layout:](#document-layout)**
+- Documents should follow the general template of:
+  - Document metadata (if appropriate).
+  - Title in `H1` (a single `#`).
+  - A brief description or summary of the document.
+  - A table of contents.
+  - The general body of the document.
+- Do not repeat content. Instead link back to the canonical source.
+- Large content or topic shifts should be separated with a horizontal rule.
+
+**[Formatting text:](#formatting-text)**
+- API objects:
+  - Follow the established [API naming convention] when referring to API Objects.
+  - Do not split API object names into their components.
+  - Use `code` style for API objects or object parameters.
+- Use **bold text** for user interface elements.
+- Use _italics_ to emphasize a new topic or subject for the first time.
+- Use angle brackets (`<` and `>`) to enclose a placeholder reference.
+- Apply `code` styling to:
+  - Filenames, directories, and paths.
+  - Command line examples and flags.
+  - Object field names.
+
+**[Language, grammar and tone:](#language-grammar-and-tone)**
+- Documentation should be written in English.
+- Prefer an active voice and present tense when possible.
+- Use simple and direct language.
+- Use gender-neutral language.
+- Avoid personal pronouns ("I," "we," "us," "our," and "ours").
+- Address the reader as "you" instead of "we".
+- Do not use Latin phrases.
+- Avoid jargon and idioms.
+- If using acronyms, ensure they are clearly defined in the same document.
+- If using an abbreviation, spell it out the first time it is used in the
+  document unless it is commonly known. (example: TCP/IP)
+
+**[Moving a document:](#moving-a-document)**
+- Use [git-mv] to move documents.
+- Commit moved documents separately from any other changes.
+- When a document has moved, leave a tombstone file with a removal date in its
+  place.
+
+**[Punctuation:](#punctuation)**
+- Do not use punctuation in headings.
+- End full sentences with a period.
+  - **Exception:** When a sentence ends with a URL or if the text would be
+    unclear if the period is a part of the previous object or word.
+- Add a single space after a period when beginning a new sentence.
+- Avoid usage of exclamation points unless they are a part of a code example.
+- Use an [Oxford comma] when a list contains 3 or more elements.
+
+**[Quotation:](#quotation)**
+- Use double-quotation marks (`" "`) over single-quotation marks (`' '`).
+  - **Exception:** In code snippets where quotation marks have specific meaning.
+  - **Exception:** When nesting quotation marks inside another set of quotation
+    marks.
+- Punctuation should be outside of quotation marks following the international
+  (British) standard.
+
+
+### Cheatsheet: Markdown
+
+**[Code blocks:](#code-blocks)**
+- When possible, reference the language at the beginning of a code block.
+- When a code block is used to reference a shell, do not include the command
+  prompt (`$`).
+  - **Exception:** When a code block is used to display raw shell output.
+- Separate commands from output.
+
+**[Emphasis:](#emphasis)**
+- Use two asterisks (`**`) for **Bold** text.
+- Use an underscore (`_`) for _Italics_.
+- Use two tildes (`~~`) for ~~Strikethrough~~.
+
+**[Headings:](#headings)**
+- Use a single `H1` (`#`) heading per document.
+  - **Exception:** `H1` may be used multiple times in the same document when
+    there is a large content shift or "chapter" change.
+- Follow the heading hierarchy of `H2` > `H3` > `H4` > `H5` > `H6`.
+- Use sentence-style capitalization in titles (first word and proper nouns).
+- Avoid using special characters.
+- Leave exactly 1 new line after a heading.
+- Avoid using links in headings.
+
+**[Horizontal rules:](#horizontal-rules)**
+- Use three dashes (`---`) to denote a horizontal rule.
+- Use a horizontal rule (`---`) to logically separate large sections (see the
+  sketch below).
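+As a minimal sketch, the emphasis, heading, and horizontal-rule conventions
+above might combine like this in a document:
+
+```markdown
+## First large section
+
+This sentence uses **bold**, _italics_, and ~~strikethrough~~ emphasis.
+
+---
+
+## Next large section
+```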
+
+**[Line length:](#line-length)**
+- Prefer an 80 character line limit.
+
+**[Links:](#links)**
+- Prefer using reference style links over inline style links.
+- When linking within the same directory, use a relative link.
+- When linking to a document outside of the current directory, use the absolute
+  path from the root of the repository.
+- When linking to a file in another Kubernetes GitHub repository, use the
+  `k8s.io` URL shortener.
+  - git.k8s.io -> github.com/kubernetes
+  - sigs.k8s.io -> github.com/kubernetes-sigs
+
+**[Lists:](#lists)**
+- Capitalize the first character of each entry unless the item is explicitly
+  case sensitive.
+- End each entry with a period if it is a sentence or phrase.
+- Use a colon (`:`) to separate a list item name from the explanatory text.
+- Leave a blank line after each list.
+- Use `-` for unordered lists.
+- For ordered lists, a repeating `1.` may be used.
+- When inserting a code block into an ordered list, indent (space) an
+  additional two times.
+
+**[Metadata:](#metadata)**
+- If the document is intended to be surfaced on the Contributor Site, include a
+  yaml metadata header at the beginning of the document.
+- Metadata must include the `title` attribute.
+
+**[Tables:](#tables)**
+- Use tables for structured information.
+- Tables do not need to adhere to the suggested line length.
+- Avoid long inline links.
+- Do not use excessively wide tables.
+
+---
+
+## Content design, formatting, and language
+
+### Contact information
+
+- Use official Kubernetes contact information.
+  - Use official community contact email addresses. There should be no personal
+    or work contact information included in public documentation; instead use
+    addresses like the [SIG Google groups] or managed accounts such as
+    community@kubernetes.io.
+  - **Good example:** community@kubernetes.io
+  - **Bad example:** bob@example.com
+
+
+### Dates and times
+
+The Kubernetes Contributor Community spans many regions and time zones.
+Following a consistent pattern and avoiding shorthand improves the readability
+for every member.
+
+- Format dates as `month day, year`. (December 13, 2018)
+  - **Good example:** October 24, 2018
+  - **Bad example:** 10/24/18
+- When conveying a date in numerical form, use [ISO 8601] format: `yyyy-mm-dd`.
+  - **Good example:** 2018-10-24
+  - **Bad example:** 10/24/18
+- Use the 24 hour clock when referencing time.
+  - **Good example:** 15:30
+  - **Bad example:** 3:30pm
+- Times for single events (example: KubeCon) should be expressed in an absolute
+  time zone such as Pacific Standard Time (PST) or Coordinated Universal Time
+  (UTC).
+  - **Good example:** The Seattle Contributor Summit starts at 9:00 PST
+  - **Bad example:** The Seattle Contributor Summit starts at 9:00 PT
+- Times for recurring events should be expressed in a time zone that follows
+  Daylight Saving Time (DST) such as Pacific Time (PT) or Eastern Time (ET).
+  - Times that follow DST are used as they adjust automatically. If UTC or
+    other non-DST compatible time zones were used, content would have to be
+    updated multiple times per year to adjust times.
+  - **Good example:** 13:30 PT
+  - **Bad example:** 16:30 EST
+- Supply a link to a globally available time zone converter service.
+  - `http://www.thetimezoneconverter.com/?t=<TIME REFERENCE>&tz=<TZ REFERENCE>`
+
+  ```
+  The weekly SIG meeting is at [13:30 PT].
+
+  [13:30 PT]: http://www.thetimezoneconverter.com/?t=13:30&tz=PT%20%28Pacific%20Time%29
+  ```
+
+
+### Diagrams, images and other assets
+
+- Images and other assets should be stored in the same directory as the
+  document that references them.
+- Filenames should be lowercase and descriptive of what they are referencing.
+  - **Good example:** `deployment-workflow.jpg`
+  - **Bad example:** `image1.jpg`
+- Avoid excessively large images or include a smaller one while linking to a
+  higher resolution version of the same image.
+- Use the [Kubernetes icon set] for architectural diagrams.
+
+
+### Document Layout
+
+Adhering to a standard document layout ensures that each page can be navigated
+intuitively once a reader is familiar with the standard layout.
+
+- Documents should follow the general template of:
+  - Document metadata (if appropriate).
+  - Title in `H1` (a single `#`).
+  - A brief description or summary of the document.
+  - A table of contents.
+  - The general body of the document.
+- Do not repeat content. Instead link back to the canonical source.
+  - It is easy for content to become out of sync if it is maintained in
+    multiple places. Linking back to the canonical source ensures that the
+    documentation will be accurate and up to date.
+- Large content or topic shifts should be separated with a horizontal rule.
+
+
+### Formatting text
+
+The formatting guidelines have been selected to mirror or augment the
+[Kubernetes documentation style-guide]. Remaining consistent across the
+different content sources improves the overall readability and understanding
+of the documentation, and gives the project a unified external appearance.
+
+- API objects:
+  - Follow the established [API naming convention] when referring to API Objects.
+  - Do not split API object names into their components.
+    - **Good example:** A `Pod` contains a `PodTemplateSpec`.
+    - **Bad example:** A `Pod` contains a `Pod Template Spec`.
+  - Use `code` style for API objects or object parameters.
+    - **Good example:** A `Deployment` contains a `DeploymentSpec`.
+    - **Bad example:** A Deployment contains a DeploymentSpec.
+- Use angle brackets (`<` and `>`) to enclose a placeholder reference.
+  - **Good example:** `kubectl describe pod <pod-name>`
+  - **Bad example:** `kubectl describe pod pod-name`
+- Use **bold text** for user interface elements.
+  - **Good example:** Select **Other**.
+  - **Bad example:** Select "Other".
+- Use _italic text_ to emphasize a new subject for the first time.
+  - **Good example:** A _cluster_ is a set of nodes.
+  - **Bad example:** A "cluster" is a set of nodes.
+- `Code` styling should be applied to:
+  - Filenames, directories, and paths.
+    - **Good example:** The default manifest path is `/etc/kubernetes/manifests`.
+    - **Bad example:** The default manifest path is /etc/kubernetes/manifests.
+  - Command line examples and flags.
+    - **Good example:** The flag `--advertise-address` is used to denote the
+      IP address on which to advertise the apiserver to members of the cluster.
+    - **Bad example:** The flag --advertise-address is used to denote the IP
+      address on which to advertise the apiserver to members of the cluster.
+  - Object field names.
+    - **Good example:** Set the `externalTrafficPolicy` to Local.
+    - **Bad example:** Set the externalTrafficPolicy to Local.
+
+
+### Language, grammar and tone
+
+- Documentation should be written in English.
+- Prefer an active voice and present tense when possible.
+  - Active voice is when the subject of the sentence performs the action,
+    whereas with passive voice the subject receives the action. Writing in an
+    active voice clearly conveys to the reader who or what is performing the
+    action.
+  - **Good example:** Updating the Deployment triggers a new ReplicaSet to be
+    created.
+  - **Bad example:** A ReplicaSet is created by updating the Deployment.
+- Use simple and direct language.
+  - Avoid using unnecessary or extra language. Be straightforward and direct.
+  - **Good example:** Wait for the Pod to start.
+  - **Bad example:** Please be patient and wait for the Pod to start.
+- Use gender-neutral language.
+  - Avoid gendered pronouns, preferring the [singular "they"][singular-they]
+    unless referring to a specific person by their preferred pronouns. For
+    further information on the subject, see
+    [Microsoft's guide to bias-free communication] and
+    [Wikipedia's entry for the Singular they].
+  - **Good example:** chair or moderator
+  - **Bad example:** chairman
+- Avoid personal pronouns ("I," "we," "us," "our," and "ours").
+  - In most cases personal pronouns should be avoided, as they can lead to
+    confusion regarding whom they refer to.
+  - **Good example:** The release-team shepherded the successful release of 1.13.
+  - **Bad example:** We shepherded the successful release of 1.13.
+- Address the reader as "you" instead of "we".
+  - Addressing the reader directly using "you" clearly denotes the target.
+    There is no confusion as there would be with "we" or "us".
+  - **Good example:** You will create a new cluster with kubeadm.
+  - **Bad example:** We will create a new cluster with kubeadm.
+- Do not use Latin phrases.
+  - [Latin phrases] can make it difficult for readers not familiar with them to
+    grasp their meaning.
+  - Some useful alternatives include:
+
+    | Latin Phrase | Alternative |
+    |:------------:|:-----------:|
+    | e.g.         | for example |
+    | et al.       | and others  |
+    | i.e.         | that is     |
+    | via          | using       |
+
+  - **Good example:** For example Deployments, ReplicaSets...
+  - **Bad example:** e.g. Deployments, ReplicaSets...
+- Avoid jargon and idioms.
+  - Jargon and idioms tend to rely on regional or tribal knowledge. They can be
+    difficult to understand for both newcomers and those whose native language
+    is not English, and should be avoided when possible.
+  - **Good example:** Internally, the kube-apiserver...
+  - **Bad example:** Under the hood the kube-apiserver...
+  - **Good example:** We will start the project in early 2019.
+  - **Bad example:** We will kick off the initiative in 2019.
+- If using an abbreviation, spell it out the first time it is used in the
+  document unless it is commonly known. (example: TCP/IP)
+  - "Abbreviation" in this context applies to abbreviations, acronyms, and
+    initialisms.
+  - **Good example:** A _CustomResourceDefinition_ (CRD) extends the Kubernetes
+    API.
+  - **Bad example:** A CRD extends the Kubernetes API.
+
+
+### Moving a document
+
+- Use [git-mv] to move documents.
+  - `git mv` will safely move/rename a file, directory, or symlink and
+    automatically update the git index.
+  - **Good example:** `git mv /old/mydoc.md /new/mydoc.md`
+  - **Bad example:** `mv /old/mydoc.md /new/mydoc.md`
+- Commit moved documents separately from any other changes.
+  - A separate commit clearly preserves the history of the relocated documents
+    and makes it easier to review.
+- When a document has moved, leave a tombstone file with a removal date in its
+  place.
+  - Tombstones function as a pointer and give users time to update their own
+    documentation and bookmarks. Their usefulness is time-bounded, and they
+    should be removed when they would logically no longer serve their purpose.
+    ```markdown
+    This file has moved to https://git.k8s.io/community/contributors/guide/README.md.
+
+    This file is a placeholder to preserve links. Please remove after 2019-03-10 or the release of kubernetes 1.10, whichever comes first.
+    ```
+
+
+### Punctuation
+
+- Do not use punctuation in headings.
+- End full sentences with a period.
+  - **Exception:** When a sentence ends with a URL or if the text would be
+    unclear if the period is a part of the previous object or word.
+- Add a single space after a period when beginning a new sentence.
+- Avoid usage of exclamation points unless they are a part of a code example.
+- Use an [Oxford comma] when a list contains 3 or more elements.
+  - **Good example:** Deployments, ReplicaSets, and DaemonSets.
+  - **Bad example:** Deployments, ReplicaSets and DaemonSets.
+
+
+### Quotation
+
+- Use double-quotation marks (`" "`) over single-quotation marks (`' '`).
+  - **Exception:** In code snippets where quotation marks have specific meaning.
+  - **Exception:** When nesting quotation marks inside another set of quotation
+    marks.
+- Punctuation should be outside of quotation marks following the international
+  (British) standard.
+
+
+---
+
+
+## Markdown formatting
+
+### Code blocks
+
+- When possible, reference the language at the beginning of a code block.
+  - The two markdown renderers used by the Kubernetes community
+    ([GitHub][gh-code-hl-list] and [Hugo][hugo-code-hl-list]) support code
+    highlighting. This can be enabled by supplying the name of the language
+    after the three back-ticks (`` ``` ``) at the start of a code block.
+  - **Good example:**
+    `````
+    ```go
+    import (
+        "fmt"
+        ...
+    )
+    ```
+    `````
+  - **Bad example:**
+    `````
+    ```
+    import (
+        "fmt"
+        ...
+    )
+    ```
+    `````
+- When a code block is used to reference a shell, do not include the command
+  prompt (`$`).
+  - When a code block is referencing a shell, it is implied that it is a
+    command prompt. The exception to this is when a code block is being used
+    for raw shell output such as debug logs.
+  - **Good example:**
+    ```
+    kubectl get pods -o wide
+    ```
+  - **Bad example:**
+    ```
+    $ kubectl get pods -o wide
+    ```
+- Separate commands from output.
+  - Separating the command from the output makes both the command and output
+    more generally readable.
+  - **Good example:**
+    ```
+    kubectl get pods
+    ```
+    ```
+    NAME    READY   STATUS    RESTARTS   AGE   IP           NODE
+    nginx   1/1     Running   0          13s   10.200.0.4   worker0
+    ```
+  - **Bad example:**
+    ```
+    kubectl get pods
+    NAME    READY   STATUS    RESTARTS   AGE   IP           NODE
+    nginx   1/1     Running   0          13s   10.200.0.4   worker0
+    ```
+
+
+### Emphasis
+
+Markdown has multiple ways of indicating each type of emphasis. Adhering to a
+standard across documentation improves supportability.
+
+- Use two asterisks (`**`) for **Bold** text.
+  - **Good example:** `This is **bold** text.`
+  - **Bad example:** `This should not be used for __bold__.`
+- Use an underscore (`_`) for _Italics_.
+  - **Good example:** `This is _italics_.`
+  - **Bad example:** `This should not be used for *italics*.`
+- Use two tildes (`~~`) for ~~Strikethrough~~.
+  - **Good example:** `This is ~~strikethrough~~.`
+  - **Bad example:** `This should not be used for ~strikethrough~.`
+
+
+### Headings
+
+Adhering to a standard across documentation improves both readability and
+overall supportability across multiple documents.
+
+- Use a single `H1` (`#`) heading per document.
+  - **Exception:** `H1` may be used multiple times in the same document when
+    there is a large content shift or "chapter" change.
+- Follow the heading hierarchy of `H2` > `H3` > `H4` > `H5` > `H6`.
+- Use sentence-style capitalization in titles (first word and proper nouns).
+- Avoid using special characters.
+- Leave exactly 1 new line after a heading.
+- Avoid using links in headings.
+
+
+### Horizontal rules
+
+Markdown has multiple ways of indicating a horizontal rule. Adhering to a
+standard across documentation improves supportability.
+
+- Use three dashes (`---`) to denote a horizontal rule.
+  - **Good example:** `---`
+  - **Bad example:** `===`
+- Use a horizontal rule (`---`) to logically separate large sections.
+
+
+### Line length
+
+- Prefer an 80 character line limit.
+  - There is no specific general best practice for Markdown line length. The
+    commonly used 80 character guideline is preferable for general text review
+    and editing.
+
+
+### Links
+
+Markdown provides two primary methods of linking to content: inline style
+links and reference style links. However, how and what they link to can vary
+widely.
+
+- Prefer using reference style links over inline style links.
+  - Reference links are shorter and easier to read. They have the added benefit
+    of being reusable throughout the entire document.
+  - The link definition itself should be at the bottom of the document. If the
+    document is large or covers many topics, place the link at the end of the
+    logical chapter or section.
+  - **Example:**
+    ```
+    See the [Code of Conduct] for more information.
+
+    [code of conduct]: https://git.k8s.io/community/code-of-conduct.md
+    ```
+  - **Example:**
+    ```
+    See the [Code of Conduct][coc] for more information.
+
+    [coc]: https://git.k8s.io/community/code-of-conduct.md
+    ```
+- When linking within the same directory, use a relative link.
+  - Links to files within the same directory are short and readable already.
+    They do not warrant expanding the full path.
+  - When the file is referenced multiple times within the same document,
+    consider using a reference link for a quicker shorthand reference.
+  - **Example:**
+    ```
+    See the [Code of Conduct](code-of-conduct.md) for more information.
+    ```
+  - **Example:**
+    ```
+    See the [Code of Conduct][coc] for more information.
+
+    [coc]: code-of-conduct.md
+    ```
+- When linking to a document outside of the current directory, use the absolute
+  path from the root of the repository.
+  - Using the absolute path ensures that if the source document is relocated,
+    the link to the target or destination document will remain intact and not
+    have to be updated.
+  - **Example:**
+    ```
+    See the [Coding Convention] doc for more information.
+
+    [Coding Convention]: /contributors/guide/coding-conventions.md
+    ```
+- When linking to a file in another Kubernetes GitHub repository, use the
+  `k8s.io` URL shortener.
+  - The shorthand version will auto-expand, linking to documents within the
+    master branch, and can be used for multiple purposes.
+
+    | Short URL           | Expanded URL                       |
+    |:-------------------:|:----------------------------------:|
+    | https://git.k8s.io  | https://github.com/kubernetes      |
+    | https://sigs.k8s.io | https://github.com/kubernetes-sigs |
+
+  - **Example:**
+    ```
+    The super cool [prow tool] resides in the test-infra repo under the kubernetes organization.
+
+    [prow tool]: https://git.k8s.io/test-infra/prow/README.md
+    ```
+
+
+### Lists
+
+Adhering to a standard across documentation improves both readability and
+overall supportability across multiple documents.
+
+- Capitalize the first character of each entry unless the item is explicitly
+  case sensitive.
+- End each entry with a period if it is a sentence or phrase.
+- Use a colon (`:`) to separate a list item name from the explanatory text.
+- Leave a blank line after each list.
+- Use `-` for unordered lists.
+- For ordered lists, a repeating `1.` may be used.
+- When inserting a code block into an ordered list, indent (space) an
+  additional two times.
+
+
+### Metadata
+
+- If the document is intended to be surfaced on the Contributor Site, include a
+  yaml metadata header at the beginning of the document.
+  - If the document is to be added to the Contributor Site, adding metadata
+    at the beginning of the document will improve the overall presentation of
+    the information. This metadata is similar to the metadata used in the
+    KEP process and is often referred to as _Frontmatter_ in common static
+    site generators such as [Jekyll] and [Hugo].
+  - The metadata header is a yaml block between two sets of `---`.
+  - **Example:**
+    ```
+    ---
+    title: Super Awesome Doc
+    ---
+    ```
+- Metadata must include the `title` attribute.
+  - `title` will be used as the title of the document when rendered with
+    [Hugo].
+
+
+### Tables
+
+- Use tables for structured information.
+  - **Example:**
+    ```
+    | Column 1       | Column 2       | Column 3       |
+    |:--------------:|:--------------:|:--------------:|
+    | test 1         | test 2         | test 3         |
+    | another test 1 | another test 2 | another test 3 |
+    ```
+- Tables do not need to adhere to the suggested line length.
+  - Markdown tables have an inherently longer line length, and cannot be
+    line wrapped.
+- Avoid long inline links.
+  - Long inline links can make it difficult to work with markdown tables.
+    Prefer to use reference style links instead.
+- Do not use excessively wide tables.
+  - Large wide tables do not render well. Try to break the information down
+    into something more easily presentable.
+
+
+## Attribution
+
+This style guide is heavily influenced by the great work of the content
+management teams at [SIG-Docs], [GitLab], [Google], and [Microsoft]. Without
+their previous efforts, this guide would not be nearly as concise as it is.
+
+[community repository]: https://git.k8s.io/community
+[Kubernetes documentation style-guide]: https://kubernetes.io/docs/contribute/style/style-guide/
+[SIG Google groups]: /sig-list.md
+[ISO 8601]: https://en.wikipedia.org/wiki/ISO_8601
+[kubernetes icon set]: /icons
+[API naming convention]: /contributors/devel/api-conventions.md#naming-conventions
+[singular-they]: https://en.wikipedia.org/wiki/Singular_they
+[Microsoft's guide to bias-free communication]: https://docs.microsoft.com/en-us/style-guide/bias-free-communication
+[Wikipedia's entry for the Singular they]: https://en.wikipedia.org/wiki/Singular_they
+[Latin phrases]: https://en.wikipedia.org/wiki/List_of_Latin_abbreviations
+[Oxford comma]: https://www.grammarly.com/blog/what-is-the-oxford-comma-and-why-do-people-care-so-much-about-it/
+[gh-code-hl-list]: https://github.com/github/linguist/blob/master/lib/linguist/languages.yml
+[hugo-code-hl-list]: http://www.rubycoloredglasses.com/2013/04/languages-supported-by-github-flavored-markdown/
+[git-mv]: https://git-scm.com/docs/git-mv
+[jekyll]: https://jekyllrb.com/
+[hugo]: https://gohugo.io/
+[gitlab]: https://docs.gitlab.com/ee/development/documentation/styleguide.html
+[google]: https://developers.google.com/style/
+[microsoft]: https://docs.microsoft.com/en-us/style-guide/welcome/
+[sig-docs]: https://kubernetes.io/docs/contribute/style/style-guide/
