1. 26 Sep, 2018 1 commit
  2. 09 Sep, 2018 1 commit
  3. 12 Jun, 2018 1 commit
    • Kees Cook's avatar
      treewide: kzalloc() -> kcalloc() · 6396bb22
      Kees Cook authored
      The kzalloc() function has a 2-factor argument form, kcalloc(). This
      patch replaces cases of:
      
              kzalloc(a * b, gfp)
      
      with:
              kcalloc(a * b, gfp)
      
      as well as handling cases of:
      
              kzalloc(a * b * c, gfp)
      
      with:
      
              kzalloc(array3_size(a, b, c), gfp)
      
      as it's slightly less ugly than:
      
              kzalloc_array(array_size(a, b), c, gfp)
      
      This does, however, attempt to ignore constant size factors like:
      
              kzalloc(4 * 1024, gfp)
      
      though any constants defined via macros get caught up in the conversion.
      
      Any factors with a sizeof() of "unsigned char", "char", and "u8" were
      dropped, since they're redundant.
      
      The Coccinelle script used for this was:
      
      // Fix redundant parens around sizeof().
      @@
      type TYPE;
      expression THING, E;
      @@
      
      (
        kzalloc(
      -	(sizeof(TYPE)) * E
      +	sizeof(TYPE) * E
        , ...)
      |
        kzalloc(
      -	(sizeof(THING)) * E
      +	sizeof(THING) * E
        , ...)
      )
      
      // Drop single-byte sizes and redundant parens.
      @@
      expression COUNT;
      typedef u8;
      typedef __u8;
      @@
      
      (
        kzalloc(
      -	sizeof(u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(__u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(char) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(unsigned char) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(u8) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(__u8) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(char) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(unsigned char) * COUNT
      +	COUNT
        , ...)
      )
      
      // 2-factor product with sizeof(type/expression) and identifier or constant.
      @@
      type TYPE;
      expression THING;
      identifier COUNT_ID;
      constant COUNT_CONST;
      @@
      
      (
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * (COUNT_ID)
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * COUNT_ID
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * COUNT_CONST
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * (COUNT_ID)
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * COUNT_ID
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * COUNT_CONST
      +	COUNT_CONST, sizeof(THING)
        , ...)
      )
      
      // 2-factor product, only identifiers.
      @@
      identifier SIZE, COUNT;
      @@
      
      - kzalloc
      + kcalloc
        (
      -	SIZE * COUNT
      +	COUNT, SIZE
        , ...)
      
      // 3-factor product with 1 sizeof(type) or sizeof(expression), with
      // redundant parens removed.
      @@
      expression THING;
      identifier STRIDE, COUNT;
      type TYPE;
      @@
      
      (
        kzalloc(
      -	sizeof(TYPE) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc(
      -	sizeof(THING) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc(
      -	sizeof(THING) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc(
      -	sizeof(THING) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc(
      -	sizeof(THING) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      )
      
      // 3-factor product with 2 sizeof(variable), with redundant parens removed.
      @@
      expression THING1, THING2;
      identifier COUNT;
      type TYPE1, TYPE2;
      @@
      
      (
        kzalloc(
      -	sizeof(TYPE1) * sizeof(TYPE2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kzalloc(
      -	sizeof(THING1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kzalloc(
      -	sizeof(THING1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      )
      
      // 3-factor product, only identifiers, with redundant parens removed.
      @@
      identifier STRIDE, SIZE, COUNT;
      @@
      
      (
        kzalloc(
      -	(COUNT) * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	COUNT * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	COUNT * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	(COUNT) * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	COUNT * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	(COUNT) * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	(COUNT) * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	COUNT * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      )
      
      // Any remaining multi-factor products, first at least 3-factor products,
      // when they're not all constants...
      @@
      expression E1, E2, E3;
      constant C1, C2, C3;
      @@
      
      (
        kzalloc(C1 * C2 * C3, ...)
      |
        kzalloc(
      -	(E1) * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc(
      -	(E1) * (E2) * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc(
      -	(E1) * (E2) * (E3)
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc(
      -	E1 * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      )
      
      // And then all remaining 2 factors products when they're not all constants,
      // keeping sizeof() as the second factor argument.
      @@
      expression THING, E1, E2;
      type TYPE;
      constant C1, C2, C3;
      @@
      
      (
        kzalloc(sizeof(THING) * C2, ...)
      |
        kzalloc(sizeof(TYPE) * C2, ...)
      |
        kzalloc(C1 * C2 * C3, ...)
      |
        kzalloc(C1 * C2, ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * (E2)
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * E2
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * (E2)
      +	E2, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * E2
      +	E2, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	(E1) * E2
      +	E1, E2
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	(E1) * (E2)
      +	E1, E2
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	E1 * E2
      +	E1, E2
        , ...)
      )
      Signed-off-by: 's avatarKees Cook <keescook@chromium.org>
      6396bb22
  4. 05 Jun, 2018 1 commit
    • Deepa Dinamani's avatar
      vfs: change inode times to use struct timespec64 · 95582b00
      Deepa Dinamani authored
      struct timespec is not y2038 safe. Transition vfs to use
      y2038 safe struct timespec64 instead.
      
      The change was made with the help of the following cocinelle
      script. This catches about 80% of the changes.
      All the header file and logic changes are included in the
      first 5 rules. The rest are trivial substitutions.
      I avoid changing any of the function signatures or any other
      filesystem specific data structures to keep the patch simple
      for review.
      
      The script can be a little shorter by combining different cases.
      But, this version was sufficient for my usecase.
      
      virtual patch
      
      @ depends on patch @
      identifier now;
      @@
      - struct timespec
      + struct timespec64
        current_time ( ... )
        {
      - struct timespec now = current_kernel_time();
      + struct timespec64 now = current_kernel_time64();
        ...
      - return timespec_trunc(
      + return timespec64_trunc(
        ... );
        }
      
      @ depends on patch @
      identifier xtime;
      @@
       struct \( iattr \| inode \| kstat \) {
       ...
      -       struct timespec xtime;
      +       struct timespec64 xtime;
       ...
       }
      
      @ depends on patch @
      identifier t;
      @@
       struct inode_operations {
       ...
      int (*update_time) (...,
      -       struct timespec t,
      +       struct timespec64 t,
      ...);
       ...
       }
      
      @ depends on patch @
      identifier t;
      identifier fn_update_time =~ "update_time$";
      @@
       fn_update_time (...,
      - struct timespec *t,
      + struct timespec64 *t,
       ...) { ... }
      
      @ depends on patch @
      identifier t;
      @@
      lease_get_mtime( ... ,
      - struct timespec *t
      + struct timespec64 *t
        ) { ... }
      
      @te depends on patch forall@
      identifier ts;
      local idexpression struct inode *inode_node;
      identifier i_xtime =~ "^i_[acm]time$";
      identifier ia_xtime =~ "^ia_[acm]time$";
      identifier fn_update_time =~ "update_time$";
      identifier fn;
      expression e, E3;
      local idexpression struct inode *node1;
      local idexpression struct inode *node2;
      local idexpression struct iattr *attr1;
      local idexpression struct iattr *attr2;
      local idexpression struct iattr attr;
      identifier i_xtime1 =~ "^i_[acm]time$";
      identifier i_xtime2 =~ "^i_[acm]time$";
      identifier ia_xtime1 =~ "^ia_[acm]time$";
      identifier ia_xtime2 =~ "^ia_[acm]time$";
      @@
      (
      (
      - struct timespec ts;
      + struct timespec64 ts;
      |
      - struct timespec ts = current_time(inode_node);
      + struct timespec64 ts = current_time(inode_node);
      )
      
      <+... when != ts
      (
      - timespec_equal(&inode_node->i_xtime, &ts)
      + timespec64_equal(&inode_node->i_xtime, &ts)
      |
      - timespec_equal(&ts, &inode_node->i_xtime)
      + timespec64_equal(&ts, &inode_node->i_xtime)
      |
      - timespec_compare(&inode_node->i_xtime, &ts)
      + timespec64_compare(&inode_node->i_xtime, &ts)
      |
      - timespec_compare(&ts, &inode_node->i_xtime)
      + timespec64_compare(&ts, &inode_node->i_xtime)
      |
      ts = current_time(e)
      |
      fn_update_time(..., &ts,...)
      |
      inode_node->i_xtime = ts
      |
      node1->i_xtime = ts
      |
      ts = inode_node->i_xtime
      |
      <+... attr1->ia_xtime ...+> = ts
      |
      ts = attr1->ia_xtime
      |
      ts.tv_sec
      |
      ts.tv_nsec
      |
      btrfs_set_stack_timespec_sec(..., ts.tv_sec)
      |
      btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
      |
      - ts = timespec64_to_timespec(
      + ts =
      ...
      -)
      |
      - ts = ktime_to_timespec(
      + ts = ktime_to_timespec64(
      ...)
      |
      - ts = E3
      + ts = timespec_to_timespec64(E3)
      |
      - ktime_get_real_ts(&ts)
      + ktime_get_real_ts64(&ts)
      |
      fn(...,
      - ts
      + timespec64_to_timespec(ts)
      ,...)
      )
      ...+>
      (
      <... when != ts
      - return ts;
      + return timespec64_to_timespec(ts);
      ...>
      )
      |
      - timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
      + timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
      |
      - timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
      + timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
      |
      - timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
      + timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
      |
      node1->i_xtime1 =
      - timespec_trunc(attr1->ia_xtime1,
      + timespec64_trunc(attr1->ia_xtime1,
      ...)
      |
      - attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
      + attr1->ia_xtime1 =  timespec64_trunc(attr2->ia_xtime2,
      ...)
      |
      - ktime_get_real_ts(&attr1->ia_xtime1)
      + ktime_get_real_ts64(&attr1->ia_xtime1)
      |
      - ktime_get_real_ts(&attr.ia_xtime1)
      + ktime_get_real_ts64(&attr.ia_xtime1)
      )
      
      @ depends on patch @
      struct inode *node;
      struct iattr *attr;
      identifier fn;
      identifier i_xtime =~ "^i_[acm]time$";
      identifier ia_xtime =~ "^ia_[acm]time$";
      expression e;
      @@
      (
      - fn(node->i_xtime);
      + fn(timespec64_to_timespec(node->i_xtime));
      |
       fn(...,
      - node->i_xtime);
      + timespec64_to_timespec(node->i_xtime));
      |
      - e = fn(attr->ia_xtime);
      + e = fn(timespec64_to_timespec(attr->ia_xtime));
      )
      
      @ depends on patch forall @
      struct inode *node;
      struct iattr *attr;
      identifier i_xtime =~ "^i_[acm]time$";
      identifier ia_xtime =~ "^ia_[acm]time$";
      identifier fn;
      @@
      {
      + struct timespec ts;
      <+...
      (
      + ts = timespec64_to_timespec(node->i_xtime);
      fn (...,
      - &node->i_xtime,
      + &ts,
      ...);
      |
      + ts = timespec64_to_timespec(attr->ia_xtime);
      fn (...,
      - &attr->ia_xtime,
      + &ts,
      ...);
      )
      ...+>
      }
      
      @ depends on patch forall @
      struct inode *node;
      struct iattr *attr;
      struct kstat *stat;
      identifier ia_xtime =~ "^ia_[acm]time$";
      identifier i_xtime =~ "^i_[acm]time$";
      identifier xtime =~ "^[acm]time$";
      identifier fn, ret;
      @@
      {
      + struct timespec ts;
      <+...
      (
      + ts = timespec64_to_timespec(node->i_xtime);
      ret = fn (...,
      - &node->i_xtime,
      + &ts,
      ...);
      |
      + ts = timespec64_to_timespec(node->i_xtime);
      ret = fn (...,
      - &node->i_xtime);
      + &ts);
      |
      + ts = timespec64_to_timespec(attr->ia_xtime);
      ret = fn (...,
      - &attr->ia_xtime,
      + &ts,
      ...);
      |
      + ts = timespec64_to_timespec(attr->ia_xtime);
      ret = fn (...,
      - &attr->ia_xtime);
      + &ts);
      |
      + ts = timespec64_to_timespec(stat->xtime);
      ret = fn (...,
      - &stat->xtime);
      + &ts);
      )
      ...+>
      }
      
      @ depends on patch @
      struct inode *node;
      struct inode *node2;
      identifier i_xtime1 =~ "^i_[acm]time$";
      identifier i_xtime2 =~ "^i_[acm]time$";
      identifier i_xtime3 =~ "^i_[acm]time$";
      struct iattr *attrp;
      struct iattr *attrp2;
      struct iattr attr ;
      identifier ia_xtime1 =~ "^ia_[acm]time$";
      identifier ia_xtime2 =~ "^ia_[acm]time$";
      struct kstat *stat;
      struct kstat stat1;
      struct timespec64 ts;
      identifier xtime =~ "^[acmb]time$";
      expression e;
      @@
      (
      ( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1  ;
      |
       node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
      |
       node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
      |
       node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
      |
       stat->xtime = node2->i_xtime1;
      |
       stat1.xtime = node2->i_xtime1;
      |
      ( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1  ;
      |
      ( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
      |
      - e = node->i_xtime1;
      + e = timespec64_to_timespec( node->i_xtime1 );
      |
      - e = attrp->ia_xtime1;
      + e = timespec64_to_timespec( attrp->ia_xtime1 );
      |
      node->i_xtime1 = current_time(...);
      |
       node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
      - e;
      + timespec_to_timespec64(e);
      |
       node->i_xtime1 = node->i_xtime3 =
      - e;
      + timespec_to_timespec64(e);
      |
      - node->i_xtime1 = e;
      + node->i_xtime1 = timespec_to_timespec64(e);
      )
      Signed-off-by: 's avatarDeepa Dinamani <deepa.kernel@gmail.com>
      Cc: <anton@tuxera.com>
      Cc: <balbi@kernel.org>
      Cc: <bfields@fieldses.org>
      Cc: <darrick.wong@oracle.com>
      Cc: <dhowells@redhat.com>
      Cc: <dsterba@suse.com>
      Cc: <dwmw2@infradead.org>
      Cc: <hch@lst.de>
      Cc: <hirofumi@mail.parknet.co.jp>
      Cc: <hubcap@omnibond.com>
      Cc: <jack@suse.com>
      Cc: <jaegeuk@kernel.org>
      Cc: <jaharkes@cs.cmu.edu>
      Cc: <jslaby@suse.com>
      Cc: <keescook@chromium.org>
      Cc: <mark@fasheh.com>
      Cc: <miklos@szeredi.hu>
      Cc: <nico@linaro.org>
      Cc: <reiserfs-devel@vger.kernel.org>
      Cc: <richard@nod.at>
      Cc: <sage@redhat.com>
      Cc: <sfrench@samba.org>
      Cc: <swhiteho@redhat.com>
      Cc: <tj@kernel.org>
      Cc: <trond.myklebust@primarydata.com>
      Cc: <tytso@mit.edu>
      Cc: <viro@zeniv.linux.org.uk>
      95582b00
  5. 31 May, 2018 11 commits
  6. 12 Apr, 2018 18 commits
    • Amir Goldstein's avatar
      ovl: add support for "xino" mount and config options · 795939a9
      Amir Goldstein authored
      With mount option "xino=on", mounter declares that there are enough
      free high bits in underlying fs to hold the layer fsid.
      If overlayfs does encounter underlying inodes using the high xino
      bits reserved for layer fsid, a warning will be emitted and the original
      inode number will be used.
      
      The mount option name "xino" goes after a similar meaning mount option
      of aufs, but in overlayfs case, the mapping is stateless.
      
      An example for a use case of "xino=on" is when upper/lower is on an xfs
      filesystem. xfs uses 64bit inode numbers, but it currently never uses the
      upper 8bit for inode numbers exposed via stat(2) and that is not likely to
      change in the future without user opting-in for a new xfs feature. The
      actual number of unused upper bit is much larger and determined by the xfs
      filesystem geometry (64 - agno_log - agblklog - inopblog). That means
      that for all practical purpose, there are enough unused bits in xfs
      inode numbers for more than OVL_MAX_STACK unique fsid's.
      
      Another use case of "xino=on" is when upper/lower is on tmpfs. tmpfs inode
      numbers are allocated sequentially since boot, so they will practially
      never use the high inode number bits.
      
      For compatibility with applications that expect 32bit inodes, the feature
      can be disabled with "xino=off". The option "xino=auto" automatically
      detects underlying filesystem that use 32bit inodes and enables the
      feature. The Kconfig option OVERLAY_FS_XINO_AUTO and module parameter of
      the same name, determine if the default mode for overlayfs mount is
      "xino=auto" or "xino=off".
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      795939a9
    • Amir Goldstein's avatar
      ovl: consistent d_ino for non-samefs with xino · adbf4f7e
      Amir Goldstein authored
      When overlay layers are not all on the same fs, but all inode numbers
      of underlying fs do not use the high 'xino' bits, overlay st_ino values
      are constant and persistent.
      
      In that case, relax non-samefs constraint for consistent d_ino and always
      iterate non-merge dir using ovl_fill_real() actor so we can remap lower
      inode numbers to unique lower fs range.
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      adbf4f7e
    • Amir Goldstein's avatar
      ovl: consistent i_ino for non-samefs with xino · 12574a9f
      Amir Goldstein authored
      When overlay layers are not all on the same fs, but all inode numbers
      of underlying fs do not use the high 'xino' bits, overlay st_ino values
      are constant and persistent.
      
      In that case, set i_ino value to the same value as st_ino for nfsd
      readdirplus validator.
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      12574a9f
    • Amir Goldstein's avatar
      ovl: constant st_ino for non-samefs with xino · e487d889
      Amir Goldstein authored
      On 64bit systems, when overlay layers are not all on the same fs, but
      all inode numbers of underlying fs are not using the high bits, use the
      high bits to partition the overlay st_ino address space.  The high bits
      hold the fsid (upper fsid is 0).  This way overlay inode numbers are unique
      and all inodes use overlay st_dev.  Inode numbers are also persistent
      for a given layer configuration.
      
      Currently, our only indication for available high ino bits is from a
      filesystem that supports file handles and uses the default encode_fh()
      operation, which encodes a 32bit inode number.
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      e487d889
    • Amir Goldstein's avatar
      ovl: allocate anon bdev per unique lower fs · 5148626b
      Amir Goldstein authored
      Instead of allocating an anonymous bdev per lower layer, allocate
      one anonymous bdev per every unique lower fs that is different than
      upper fs.
      
      Every unique lower fs is assigned an fsid > 0 and the number of
      unique lower fs are stored in ofs->numlowerfs.
      
      The assigned fsid is stored in the lower layer struct and will be
      used also for inode number multiplexing.
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      5148626b
    • Amir Goldstein's avatar
      ovl: factor out ovl_map_dev_ino() helper · da309e8c
      Amir Goldstein authored
      A helper for ovl_getattr() to map the values of st_dev and st_ino
      according to constant st_ino rules.
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      da309e8c
    • Miklos Szeredi's avatar
      ovl: cleanup ovl_update_time() · 8f35cf51
      Miklos Szeredi authored
      No need to mess with an alias, the upperdentry can be retrieved directly
      from the overlay inode.
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      8f35cf51
    • Miklos Szeredi's avatar
      3a291774
    • Vivek Goyal's avatar
      ovl: cleanup setting OVL_INDEX · 0471a9cd
      Vivek Goyal authored
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      0471a9cd
    • Vivek Goyal's avatar
      ovl: set d->is_dir and d->opaque for last path element · 102b0d11
      Vivek Goyal authored
      Certain properties in ovl_lookup_data should be set only for the last
      element of the path. IOW, if we are calling ovl_lookup_single() for an
      absolute redirect, then d->is_dir and d->opaque do not make much sense
      for intermediate path elements. Instead set them only if dentry being
      lookup is last path element.
      
      As of now we do not seem to be making use of d->opaque if it is set for
      a path/dentry in lower. But just define the semantics so that future code
      can make use of this assumption.
      Signed-off-by: 's avatarVivek Goyal <vgoyal@redhat.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      102b0d11
    • Vivek Goyal's avatar
      ovl: Do not check for redirect if this is last layer · e9b77f90
      Vivek Goyal authored
      If we are looking in last layer, then there should not be any need to
      process redirect. redirect information is used only for lookup in next
      lower layer and there is no more lower layer to look into. So no need
      to process redirects.
      
      IOW, ignore redirects on lowest layer.
      Signed-off-by: 's avatarVivek Goyal <vgoyal@redhat.com>
      Reviewed-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      e9b77f90
    • Amir Goldstein's avatar
      ovl: lookup in inode cache first when decoding lower file handle · 8b58924a
      Amir Goldstein authored
      When decoding a lower file handle, we need to check if lower file was
      copied up and indexed and if it has a whiteout index, we need to check
      if this is an unlinked but open non-dir before returning -ESTALE.
      
      To find out if this is an unlinked but open non-dir we need to lookup
      an overlay inode in inode cache by lower inode and that requires decoding
      the lower file handle before looking in inode cache.
      
      Before this change, if the lower inode turned out to be a directory, we
      may have paid an expensive cost to reconnect that lower directory for
      nothing.
      
      After this change, we start by decoding a disconnected lower dentry and
      using the lower inode for looking up an overlay inode in inode cache.
      If we find overlay inode and dentry in cache, we avoid the index lookup
      overhead. If we don't find an overlay inode and dentry in cache, then we
      only need to decode a connected lower dentry in case the lower dentry is
      a non-indexed directory.
      
      The xfstests group overlay/exportfs tests decoding overlayfs file
      handles after drop_caches with different states of the file at encode
      and decode time. Overall the tests in the group call ovl_lower_fh_to_d()
      89 times to decode a lower file handle.
      
      Before this change, the tests called ovl_get_index_fh() 75 times and
      reconnect_one() 61 times.
      After this change, the tests call ovl_get_index_fh() 70 times and
      reconnect_one() 59 times. The 2 cases where reconnect_one() was avoided
      are cases where a non-upper directory file handle was encoded, then the
      directory removed and then file handle was decoded.
      
      To demonstrate the affect on decoding file handles with hot inode/dentry
      cache, the drop_caches call in the tests was disabled. Without
      drop_caches, there are no reconnect_one() calls at all before or after
      the change. Before the change, there are 75 calls to ovl_get_index_fh(),
      exactly as the case with drop_caches. After the change, there are only
      10 calls to ovl_get_index_fh().
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      8b58924a
    • Amir Goldstein's avatar
      ovl: do not try to reconnect a disconnected origin dentry · 8a22efa1
      Amir Goldstein authored
      On lookup of non directory, we try to decode the origin file handle
      stored in upper inode. The origin file handle is supposed to be decoded
      to a disconnected non-dir dentry, which is fine, because we only need
      the lower inode of a copy up origin.
      
      However, if the origin file handle somehow turns out to be a directory
      we pay the expensive cost of reconnecting the directory dentry, only to
      get a mismatch file type and drop the dentry.
      
      Optimize this case by explicitly opting out of reconnecting the dentry.
      Opting-out of reconnect is done by passing a NULL acceptable callback
      to exportfs_decode_fh().
      
      While the case described above is a strange corner case that does not
      really need to be optimized, the API added for this optimization will
      be used by a following patch to optimize a more common case of decoding
      an overlayfs file handle.
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      8a22efa1
    • Amir Goldstein's avatar
      ovl: disambiguate ovl_encode_fh() · 5b2cccd3
      Amir Goldstein authored
      Rename ovl_encode_fh() to ovl_encode_real_fh() to differentiate from the
      exportfs function ovl_encode_inode_fh() and change the latter to
      ovl_encode_fh() to match the exportfs method name.
      
      Rename ovl_decode_fh() to ovl_decode_real_fh() for consistency.
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      5b2cccd3
    • Amir Goldstein's avatar
      ovl: set lower layer st_dev only if setting lower st_ino · 9f99e50d
      Amir Goldstein authored
      For broken hardlinks, we do not return lower st_ino, so we should
      also not return lower pseudo st_dev.
      
      Fixes: a0c5ad30 ("ovl: relax same fs constraint for constant st_ino")
      Cc: <stable@vger.kernel.org> #v4.15
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      9f99e50d
    • Amir Goldstein's avatar
      ovl: fix lookup with middle layer opaque dir and absolute path redirects · 3ec9b3fa
      Amir Goldstein authored
      As of now if we encounter an opaque dir while looking for a dentry, we set
      d->last=true. This means that there is no need to look further in any of
      the lower layers. This works fine as long as there are no redirets or
      relative redircts. But what if there is an absolute redirect on the
      children dentry of opaque directory. We still need to continue to look into
      next lower layer. This patch fixes it.
      
      Here is an example to demonstrate the issue. Say you have following setup.
      
      upper:  /redirect (redirect=/a/b/c)
      lower1: /a/[b]/c       ([b] is opaque) (c has absolute redirect=/a/b/d/)
      lower0: /a/b/d/foo
      
      Now "redirect" dir should merge with lower1:/a/b/c/ and lower0:/a/b/d.
      Note, despite the fact lower1:/a/[b] is opaque, we need to continue to look
      into lower0 because children c has an absolute redirect.
      
      Following is a reproducer.
      
      Watch me make foo disappear:
      
       $ mkdir lower middle upper work work2 merged
       $ mkdir lower/origin
       $ touch lower/origin/foo
       $ mount -t overlay none merged/ \
               -olowerdir=lower,upperdir=middle,workdir=work2
       $ mkdir merged/pure
       $ mv merged/origin merged/pure/redirect
       $ umount merged
       $ mount -t overlay none merged/ \
               -olowerdir=middle:lower,upperdir=upper,workdir=work
       $ mv merged/pure/redirect merged/redirect
      
      Now you see foo inside a twice redirected merged dir:
      
       $ ls merged/redirect
       foo
       $ umount merged
       $ mount -t overlay none merged/ \
               -olowerdir=middle:lower,upperdir=upper,workdir=work
      
      After mount cycle you don't see foo inside the same dir:
      
       $ ls merged/redirect
      
      During middle layer lookup, the opaqueness of middle/pure is left in
      the lookup state and then middle/pure/redirect is wrongly treated as
      opaque.
      
      Fixes: 02b69b28 ("ovl: lookup redirects")
      Cc: <stable@vger.kernel.org> #v4.10
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarVivek Goyal <vgoyal@redhat.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      3ec9b3fa
    • Vivek Goyal's avatar
      ovl: Set d->last properly during lookup · 452061fd
      Vivek Goyal authored
      d->last signifies that this is the last layer we are looking into and there
      is no more. And that means this allows for some optimzation opportunities
      during lookup. For example, in ovl_lookup_single() we don't have to check
      for opaque xattr of a directory is this is the last layer we are looking
      into (d->last = true).
      
      But knowing for sure whether we are looking into last layer can be very
      tricky. If redirects are not enabled, then we can look at poe->numlower and
      figure out if the lookup we are about to is last layer or not. But if
      redircts are enabled then it is possible poe->numlower suggests that we are
      looking in last layer, but there is an absolute redirect present in found
      element and that redirects us to a layer in root and that means lookup will
      continue in lower layers further.
      
      For example, consider following.
      
      /upperdir/pure (opaque=y)
      /upperdir/pure/foo (opaque=y,redirect=/bar)
      /lowerdir/bar
      
      In this case pure is "pure upper". When we look for "foo", that time
      poe->numlower=0. But that alone does not mean that we will not search for a
      merge candidate in /lowerdir. Absolute redirect changes that.
      
      IOW, d->last should not be set just based on poe->numlower if redirects are
      enabled. That can lead to setting d->last while it should not have and that
      means we will not check for opaque xattr while we should have.
      
      So do this.
      
       - If redirects are not enabled, then continue to rely on poe->numlower
         information to determine if it is last layer or not.
      
       - If redirects are enabled, then set d->last = true only if this is the
         last layer in root ovl_entry (roe).
      Suggested-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Reviewed-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarVivek Goyal <vgoyal@redhat.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      Fixes: 02b69b28 ("ovl: lookup redirects")
      Cc: <stable@vger.kernel.org> #v4.10
      452061fd
    • Amir Goldstein's avatar
      ovl: set i_ino to the value of st_ino for NFS export · 695b46e7
      Amir Goldstein authored
      Eddie Horng reported that readdir of an overlayfs directory that
      was exported via NFSv3 returns entries with d_type set to DT_UNKNOWN.
      The reason is that while preparing the response for readdirplus, nfsd
      checks inside encode_entryplus_baggage() that a child dentry's inode
      number matches the value of d_ino returns by overlayfs readdir iterator.
      
      Because the overlayfs inodes use arbitrary inode numbers that are not
      correlated with the values of st_ino/d_ino, NFSv3 falls back to not
      encoding d_type. Although this is an allowed behavior, we can fix it for
      the case of all overlayfs layers on the same underlying filesystem.
      
      When NFS export is enabled and d_ino is consistent with st_ino
      (samefs), set the same value also to i_ino in ovl_fill_inode() for all
      overlayfs inodes, nfsd readdirplus sanity checks will pass.
      ovl_fill_inode() may be called from ovl_new_inode(), before real inode
      was created with ino arg 0. In that case, i_ino will be updated to real
      upper inode i_ino on ovl_inode_init() or ovl_inode_update().
      Reported-by: 's avatarEddie Horng <eddiehorng.tw@gmail.com>
      Tested-by: 's avatarEddie Horng <eddiehorng.tw@gmail.com>
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Fixes: 8383f174 ("ovl: wire up NFS export operations")
      Cc: <stable@vger.kernel.org> #v4.16
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      695b46e7
  7. 07 Mar, 2018 1 commit
  8. 26 Feb, 2018 2 commits
    • Vivek Goyal's avatar
      ovl: redirect_dir=nofollow should not follow redirect for opaque lower · d1fe96c0
      Vivek Goyal authored
      redirect_dir=nofollow should not follow a redirect. But in a specific
      configuration it can still follow it.  For example try this.
      
      $ mkdir -p lower0 lower1/foo upper work merged
      $ touch lower1/foo/lower-file.txt
      $ setfattr -n "trusted.overlay.opaque" -v "y" lower1/foo
      $ mount -t overlay -o lowerdir=lower1:lower0,workdir=work,upperdir=upper,redirect_dir=on none merged
      $ cd merged
      $ mv foo foo-renamed
      $ umount merged
      
      # mount again. This time with redirect_dir=nofollow
      $ mount -t overlay -o lowerdir=lower1:lower0,workdir=work,upperdir=upper,redirect_dir=nofollow none merged
      $ ls merged/foo-renamed/
      # This lists lower-file.txt, while it should not have.
      
      Basically, we are doing redirect check after we check for d.stop. And
      if this is not last lower, and we find an opaque lower, d.stop will be
      set.
      
      ovl_lookup_single()
              if (!d->last && ovl_is_opaquedir(this)) {
                      d->stop = d->opaque = true;
                      goto out;
              }
      
      To fix this, first check redirect is allowed. And after that check if
      d.stop has been set or not.
      Signed-off-by: 's avatarVivek Goyal <vgoyal@redhat.com>
      Fixes: 438c84c2 ("ovl: don't follow redirects if redirect_dir=off")
      Cc: <stable@vger.kernel.org> #v4.15
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      d1fe96c0
    • Fengguang Wu's avatar
      ovl: fix ptr_ret.cocci warnings · b5095f24
      Fengguang Wu authored
      fs/overlayfs/export.c:459:10-16: WARNING: PTR_ERR_OR_ZERO can be used
      
       Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR
      
      Generated by: scripts/coccinelle/api/ptr_ret.cocci
      
      Fixes: 4b91c30a ("ovl: lookup connected ancestor of dir in inode cache")
      CC: Amir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarFengguang Wu <fengguang.wu@intel.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      b5095f24
  9. 16 Feb, 2018 3 commits
    • Amir Goldstein's avatar
      ovl: check ERR_PTR() return value from ovl_lookup_real() · 7168179f
      Amir Goldstein authored
      Reported-by: 's avatarDan Carpenter <dan.carpenter@oracle.com>
      Fixes: 06170154 ("ovl: lookup indexed ancestor of lower dir")
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      7168179f
    • Amir Goldstein's avatar
      ovl: check lower ancestry on encode of lower dir file handle · 2ca3c148
      Amir Goldstein authored
      This change relaxes copy up on encode of merge dir with lower layer > 1
      and handles the case of encoding a merge dir with lower layer 1, where an
      ancestor is a non-indexed merge dir. In that case, decode of the lower
      file handle will not have been possible if the non-indexed ancestor is
      redirected before or after encode.
      
      Before encoding a non-upper directory file handle from real layer N, we
      need to check if it will be possible to reconnect an overlay dentry from
      the real lower decoded dentry. This is done by following the overlay
      ancestry up to a "layer N connected" ancestor and verifying that all
      parents along the way are "layer N connectable". If an ancestor that is
      NOT "layer N connectable" is found, we need to copy up an ancestor, which
      is "layer N connectable", thus making that ancestor "layer N connected".
      For example:
      
       layer 1: /a
       layer 2: /a/b/c
      
      The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
      copied up and renamed, upper dir /a will be indexed by lower dir /a from
      layer 1. The dir /a from layer 2 will never be indexed, so the algorithm
      in ovl_lookup_real_ancestor() (*) will not be able to lookup a connected
      overlay dentry from the connected lower dentry /a/b/c.
      
      To avoid this problem on decode time, we need to copy up an ancestor of
      /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is
      /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
      and when the time comes to decode the file handle from lower dentry /a/b/c,
      ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
      a connected overlay dentry will be accomplished.
      
      (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup
      an entry /a in the lower layers above layer N and find the indexed dir /a
      from layer 1. If that improvement is made, then the check for "layer N
      connected" will need to verify there are no redirects in lower layers above
      layer N. In the example above, /a will be "layer 2 connectable". However,
      if layer 2 dir /a is a target of a layer 1 redirect, then /a will NOT be
      "layer 2 connectable":
      
       layer 1: /A (redirect = /a)
       layer 2: /a/b/c
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      2ca3c148
    • Amir Goldstein's avatar
      ovl: hash non-dir by lower inode for fsnotify · 764baba8
      Amir Goldstein authored
      Commit 31747eda ("ovl: hash directory inodes for fsnotify")
      fixed an issue of inotify watch on directory that stops getting
      events after dropping dentry caches.
      
      A similar issue exists for non-dir non-upper files, for example:
      
      $ mkdir -p lower upper work merged
      $ touch lower/foo
      $ mount -t overlay -o
      lowerdir=lower,workdir=work,upperdir=upper none merged
      $ inotifywait merged/foo &
      $ echo 2 > /proc/sys/vm/drop_caches
      $ cat merged/foo
      
      inotifywait doesn't get the OPEN event, because ovl_lookup() called
      from 'cat' allocates a new overlay inode and does not reuse the
      watched inode.
      
      Fix this by hashing non-dir overlay inodes by lower real inode in
      the following cases that were not hashed before this change:
       - A non-upper overlay mount
       - A lower non-hardlink when index=off
      
      A helper ovl_hash_bylower() was added to put all the logic and
      documentation about which real inode an overlay inode is hashed by
      into one place.
      
      The issue dates back to initial version of overlayfs, but this
      patch depends on ovl_inode code that was introduced in kernel v4.13.
      
      Cc: <stable@vger.kernel.org> #v4.13
      Signed-off-by: 's avatarAmir Goldstein <amir73il@gmail.com>
      Signed-off-by: 's avatarMiklos Szeredi <mszeredi@redhat.com>
      764baba8
  10. 05 Feb, 2018 1 commit