Home | History | Annotate | Line # | Download | only in io
      1  1.1    cherry /******************************************************************************
      2  1.1    cherry  * blkif.h
      3  1.1    cherry  *
      4  1.1    cherry  * Unified block-device I/O interface for Xen guest OSes.
      5  1.1    cherry  *
      6  1.1    cherry  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  1.1    cherry  * of this software and associated documentation files (the "Software"), to
      8  1.1    cherry  * deal in the Software without restriction, including without limitation the
      9  1.1    cherry  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
     10  1.1    cherry  * sell copies of the Software, and to permit persons to whom the Software is
     11  1.1    cherry  * furnished to do so, subject to the following conditions:
     12  1.1    cherry  *
     13  1.1    cherry  * The above copyright notice and this permission notice shall be included in
     14  1.1    cherry  * all copies or substantial portions of the Software.
     15  1.1    cherry  *
     16  1.1    cherry  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  1.1    cherry  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  1.1    cherry  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     19  1.1    cherry  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  1.1    cherry  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     21  1.1    cherry  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     22  1.1    cherry  * DEALINGS IN THE SOFTWARE.
     23  1.1    cherry  *
     24  1.1    cherry  * Copyright (c) 2003-2004, Keir Fraser
     25  1.1    cherry  * Copyright (c) 2012, Spectra Logic Corporation
     26  1.1    cherry  */
     27  1.1    cherry 
     28  1.1    cherry #ifndef __XEN_PUBLIC_IO_BLKIF_H__
     29  1.1    cherry #define __XEN_PUBLIC_IO_BLKIF_H__
     30  1.1    cherry 
     31  1.1    cherry #include "ring.h"
     32  1.1    cherry #include "../grant_table.h"
     33  1.1    cherry 
     34  1.1    cherry /*
     35  1.1    cherry  * Front->back notifications: When enqueuing a new request, sending a
     36  1.1    cherry  * notification can be made conditional on req_event (i.e., the generic
     37  1.1    cherry  * hold-off mechanism provided by the ring macros). Backends must set
     38  1.1    cherry  * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
     39  1.1    cherry  *
     40  1.1    cherry  * Back->front notifications: When enqueuing a new response, sending a
     41  1.1    cherry  * notification can be made conditional on rsp_event (i.e., the generic
     42  1.1    cherry  * hold-off mechanism provided by the ring macros). Frontends must set
     43  1.1    cherry  * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
     44  1.1    cherry  */
     45  1.1    cherry 
     46  1.1    cherry #ifndef blkif_vdev_t   /* keep any definition supplied before this header */
     47  1.1    cherry #define blkif_vdev_t   uint16_t
     48  1.1    cherry #endif
     49  1.1    cherry #define blkif_sector_t uint64_t   /* defined unconditionally, unlike blkif_vdev_t */
     50  1.1    cherry 
     51  1.1    cherry /*
     52  1.1    cherry  * Feature and Parameter Negotiation
     53  1.1    cherry  * =================================
     54  1.1    cherry  * The two halves of a Xen block driver utilize nodes within the XenStore to
     55  1.1    cherry  * communicate capabilities and to negotiate operating parameters.  This
     56  1.1    cherry  * section enumerates these nodes which reside in the respective front and
     57  1.1    cherry  * backend portions of the XenStore, following the XenBus convention.
     58  1.1    cherry  *
     59  1.1    cherry  * All data in the XenStore is stored as strings.  Nodes specifying numeric
     60  1.1    cherry  * values are encoded in decimal.  Integer value ranges listed below are
     61  1.1    cherry  * expressed as fixed sized integer types capable of storing the conversion
     62  1.1    cherry  * of a properly formatted node string, without loss of information.
     63  1.1    cherry  *
     64  1.1    cherry  * Any specified default value is in effect if the corresponding XenBus node
     65  1.1    cherry  * is not present in the XenStore.
     66  1.1    cherry  *
     67  1.1    cherry  * XenStore nodes in sections marked "PRIVATE" are solely for use by the
     68  1.1    cherry  * driver side whose XenBus tree contains them.
     69  1.1    cherry  *
     70  1.1    cherry  * XenStore nodes marked "DEPRECATED" in their notes section should only be
     71  1.1    cherry  * used to provide interoperability with legacy implementations.
     72  1.1    cherry  *
     73  1.1    cherry  * See the XenBus state transition diagram below for details on when XenBus
     74  1.1    cherry  * nodes must be published and when they can be queried.
     75  1.1    cherry  *
     76  1.1    cherry  *****************************************************************************
     77  1.1    cherry  *                            Backend XenBus Nodes
     78  1.1    cherry  *****************************************************************************
     79  1.1    cherry  *
     80  1.1    cherry  *------------------ Backend Device Identification (PRIVATE) ------------------
     81  1.1    cherry  *
     82  1.1    cherry  * mode
     83  1.1    cherry  *      Values:         "r" (read only), "w" (writable)
     84  1.1    cherry  *
     85  1.1    cherry  *      The read or write access permissions to the backing store to be
     86  1.1    cherry  *      granted to the frontend.
     87  1.1    cherry  *
     88  1.1    cherry  * params
     89  1.1    cherry  *      Values:         string
     90  1.1    cherry  *
     91  1.1    cherry  *      A free formatted string providing sufficient information for the
     92  1.1    cherry  *      hotplug script to attach the device and provide a suitable
     93  1.1    cherry  *      handler (ie: a block device) for blkback to use.
     94  1.1    cherry  *
     95  1.1    cherry  * physical-device
     96  1.1    cherry  *      Values:         "MAJOR:MINOR"
     97  1.1    cherry  *      Notes: 11
     98  1.1    cherry  *
     99  1.1    cherry  *      MAJOR and MINOR are the major number and minor number of the
    100  1.1    cherry  *      backing device respectively.
    101  1.1    cherry  *
    102  1.1    cherry  * physical-device-path
    103  1.1    cherry  *      Values:         path string
    104  1.1    cherry  *
    105  1.1    cherry  *      A string that contains the absolute path to the disk image. On
    106  1.1    cherry  *      NetBSD and Linux this is always a block device, while on FreeBSD
    107  1.1    cherry  *      it can be either a block device or a regular file.
    108  1.1    cherry  *
    109  1.1    cherry  * type
    110  1.1    cherry  *      Values:         "file", "phy", "tap"
    111  1.1    cherry  *
    112  1.1    cherry  *      The type of the backing device/object.
    113  1.1    cherry  *
    114  1.1    cherry  *
    115  1.1    cherry  * direct-io-safe
    116  1.1    cherry  *      Values:         0/1 (boolean)
    117  1.1    cherry  *      Default Value:  0
    118  1.1    cherry  *
    119  1.1    cherry  *      The underlying storage is not affected by the direct IO memory
    120  1.1    cherry  *      lifetime bug.  See:
    121  1.1    cherry  *        http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html
    122  1.1    cherry  *
    123  1.1    cherry  *      Therefore this option gives the backend permission to use
    124  1.1    cherry  *      O_DIRECT, notwithstanding that bug.
    125  1.1    cherry  *
    126  1.1    cherry  *      That is, if this option is enabled, use of O_DIRECT is safe,
    127  1.1    cherry  *      in circumstances where we would normally have avoided it as a
    128  1.1    cherry  *      workaround for that bug.  This option is not relevant for all
    129  1.1    cherry  *      backends, and even not necessarily supported for those for
    130  1.1    cherry  *      which it is relevant.  A backend which knows that it is not
    131  1.1    cherry  *      affected by the bug can ignore this option.
    132  1.1    cherry  *
    133  1.1    cherry  *      This option doesn't require a backend to use O_DIRECT, so it
    134  1.1    cherry  *      should not be used to try to control the caching behaviour.
    135  1.1    cherry  *
    136  1.1    cherry  *--------------------------------- Features ---------------------------------
    137  1.1    cherry  *
    138  1.1    cherry  * feature-barrier
    139  1.1    cherry  *      Values:         0/1 (boolean)
    140  1.1    cherry  *      Default Value:  0
    141  1.1    cherry  *
    142  1.1    cherry  *      A value of "1" indicates that the backend can process requests
    143  1.1    cherry  *      containing the BLKIF_OP_WRITE_BARRIER request opcode.  Requests
    144  1.1    cherry  *      of this type may still be returned at any time with the
    145  1.1    cherry  *      BLKIF_RSP_EOPNOTSUPP result code.
    146  1.1    cherry  *
    147  1.1    cherry  * feature-flush-cache
    148  1.1    cherry  *      Values:         0/1 (boolean)
    149  1.1    cherry  *      Default Value:  0
    150  1.1    cherry  *
    151  1.1    cherry  *      A value of "1" indicates that the backend can process requests
    152  1.1    cherry  *      containing the BLKIF_OP_FLUSH_DISKCACHE request opcode.  Requests
    153  1.1    cherry  *      of this type may still be returned at any time with the
    154  1.1    cherry  *      BLKIF_RSP_EOPNOTSUPP result code.
    155  1.1    cherry  *
    156  1.1    cherry  * feature-discard
    157  1.1    cherry  *      Values:         0/1 (boolean)
    158  1.1    cherry  *      Default Value:  0
    159  1.1    cherry  *
    160  1.1    cherry  *      A value of "1" indicates that the backend can process requests
    161  1.1    cherry  *      containing the BLKIF_OP_DISCARD request opcode.  Requests
    162  1.1    cherry  *      of this type may still be returned at any time with the
    163  1.1    cherry  *      BLKIF_RSP_EOPNOTSUPP result code.
    164  1.1    cherry  *
    165  1.1    cherry  * feature-persistent
    166  1.1    cherry  *      Values:         0/1 (boolean)
    167  1.1    cherry  *      Default Value:  0
    168  1.1    cherry  *      Notes: 7
    169  1.1    cherry  *
    170  1.1    cherry  *      A value of "1" indicates that the backend can keep the grants used
    171  1.1    cherry  *      by the frontend driver mapped, so the same set of grants should be
    172  1.1    cherry  *      used in all transactions. The maximum number of grants the backend
    173  1.1    cherry  *      can map persistently depends on the implementation, but ideally it
    174  1.1    cherry  *      should be RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. Using this
    175  1.1    cherry  *      feature the backend doesn't need to unmap each grant, preventing
    176  1.1    cherry  *      costly TLB flushes. The backend driver should only map grants
    177  1.1    cherry  *      persistently if the frontend supports it. If a backend driver chooses
    178  1.1    cherry  *      to use the persistent protocol when the frontend doesn't support it,
    179  1.1    cherry  *      it will probably hit the maximum number of persistently mapped grants
    180  1.1    cherry  *      (due to the fact that the frontend won't be reusing the same grants),
    181  1.1    cherry  *      and fall back to non-persistent mode. Backend implementations may
    182  1.1    cherry  *      shrink or expand the number of persistently mapped grants without
    183  1.1    cherry  *      notifying the frontend depending on memory constraints (this might
    184  1.1    cherry  *      cause a performance degradation).
    185  1.1    cherry  *
    186  1.1    cherry  *      If a backend driver wants to limit the maximum number of persistently
    187  1.1    cherry  *      mapped grants to a value less than RING_SIZE *
    188  1.1    cherry  *      BLKIF_MAX_SEGMENTS_PER_REQUEST, an LRU strategy should be used to
    189  1.1    cherry  *      discard the grants that are less commonly used. Using an LRU in the
    190  1.1    cherry  *      backend driver paired with a LIFO queue in the frontend will
    191  1.1    cherry  *      allow us to have better performance in this scenario.
    192  1.1    cherry  *
    193  1.1    cherry  *----------------------- Request Transport Parameters ------------------------
    194  1.1    cherry  *
    195  1.1    cherry  * max-ring-page-order
    196  1.1    cherry  *      Values:         <uint32_t>
    197  1.1    cherry  *      Default Value:  0
    198  1.1    cherry  *      Notes:          1, 3
    199  1.1    cherry  *
    200  1.1    cherry  *      The maximum supported size of the request ring buffer in units of
    201  1.1    cherry  *      lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
    202  1.1    cherry  *      etc.).
    203  1.1    cherry  *
    204  1.1    cherry  * max-ring-pages
    205  1.1    cherry  *      Values:         <uint32_t>
    206  1.1    cherry  *      Default Value:  1
    207  1.1    cherry  *      Notes:          DEPRECATED, 2, 3
    208  1.1    cherry  *
    209  1.1    cherry  *      The maximum supported size of the request ring buffer in units of
    210  1.1    cherry  *      machine pages.  The value must be a power of 2.
    211  1.1    cherry  *
    212  1.1    cherry  *------------------------- Backend Device Properties -------------------------
    213  1.1    cherry  *
    214  1.1    cherry  * discard-enable
    215  1.1    cherry  *      Values:         0/1 (boolean)
    216  1.1    cherry  *      Default Value:  1
    217  1.1    cherry  *
    218  1.1    cherry  *      This optional property, set by the toolstack, instructs the backend
    219  1.1    cherry  *      to offer (or not to offer) discard to the frontend. If the property
    220  1.1    cherry  *      is missing the backend should offer discard if the backing storage
    221  1.1    cherry  *      actually supports it.
    222  1.1    cherry  *
    223  1.1    cherry  * discard-alignment
    224  1.1    cherry  *      Values:         <uint32_t>
    225  1.1    cherry  *      Default Value:  0
    226  1.1    cherry  *      Notes:          4, 5
    227  1.1    cherry  *
    228  1.1    cherry  *      The offset, in bytes from the beginning of the virtual block device,
    229  1.1    cherry  *      to the first, addressable, discard extent on the underlying device.
    230  1.1    cherry  *
    231  1.1    cherry  * discard-granularity
    232  1.1    cherry  *      Values:         <uint32_t>
    233  1.1    cherry  *      Default Value:  <"sector-size">
    234  1.1    cherry  *      Notes:          4
    235  1.1    cherry  *
    236  1.1    cherry  *      The size, in bytes, of the individually addressable discard extents
    237  1.1    cherry  *      of the underlying device.
    238  1.1    cherry  *
    239  1.1    cherry  * discard-secure
    240  1.1    cherry  *      Values:         0/1 (boolean)
    241  1.1    cherry  *      Default Value:  0
    242  1.1    cherry  *      Notes:          10
    243  1.1    cherry  *
    244  1.1    cherry  *      A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
    245  1.1    cherry  *      requests with the BLKIF_DISCARD_SECURE flag set.
    246  1.1    cherry  *
    247  1.1    cherry  * info
    248  1.1    cherry  *      Values:         <uint32_t> (bitmap)
    249  1.1    cherry  *
    250  1.1    cherry  *      A collection of bit flags describing attributes of the backing
    251  1.1    cherry  *      device.  The VDISK_* macros define the meaning of each bit
    252  1.1    cherry  *      location.
    253  1.1    cherry  *
    254  1.1    cherry  * sector-size
    255  1.1    cherry  *      Values:         <uint32_t>
    256  1.1    cherry  *
    257  1.1    cherry  *      The logical sector size, in bytes, of the backend device.
    258  1.1    cherry  *
    259  1.1    cherry  * physical-sector-size
    260  1.1    cherry  *      Values:         <uint32_t>
    261  1.1    cherry  *
    262  1.1    cherry  *      The physical sector size, in bytes, of the backend device.
    263  1.1    cherry  *
    264  1.1    cherry  * sectors
    265  1.1    cherry  *      Values:         <uint64_t>
    266  1.1    cherry  *
    267  1.1    cherry  *      The size of the backend device, expressed in units of its logical
    268  1.1    cherry  *      sector size ("sector-size").
    269  1.1    cherry  *
    270  1.1    cherry  *****************************************************************************
    271  1.1    cherry  *                            Frontend XenBus Nodes
    272  1.1    cherry  *****************************************************************************
    273  1.1    cherry  *
    274  1.1    cherry  *----------------------- Request Transport Parameters -----------------------
    275  1.1    cherry  *
    276  1.1    cherry  * event-channel
    277  1.1    cherry  *      Values:         <uint32_t>
    278  1.1    cherry  *
    279  1.1    cherry  *      The identifier of the Xen event channel used to signal activity
    280  1.1    cherry  *      in the ring buffer.
    281  1.1    cherry  *
    282  1.1    cherry  * ring-ref
    283  1.1    cherry  *      Values:         <uint32_t>
    284  1.1    cherry  *      Notes:          6
    285  1.1    cherry  *
    286  1.1    cherry  *      The Xen grant reference granting permission for the backend to map
    287  1.1    cherry  *      the sole page in a single page sized ring buffer.
    288  1.1    cherry  *
    289  1.1    cherry  * ring-ref%u
    290  1.1    cherry  *      Values:         <uint32_t>
    291  1.1    cherry  *      Notes:          6
    292  1.1    cherry  *
    293  1.1    cherry  *      For a frontend providing a multi-page ring, a "number of ring pages"
    294  1.1    cherry  *      sized list of nodes, each containing a Xen grant reference granting
    295  1.1    cherry  *      permission for the backend to map the page of the ring located
    296  1.1    cherry  *      at page index "%u".  Page indexes are zero based.
    297  1.1    cherry  *
    298  1.1    cherry  * protocol
    299  1.1    cherry  *      Values:         string (XEN_IO_PROTO_ABI_*)
    300  1.1    cherry  *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
    301  1.1    cherry  *
    302  1.1    cherry  *      The machine ABI rules governing the format of all ring request and
    303  1.1    cherry  *      response structures.
    304  1.1    cherry  *
    305  1.1    cherry  * ring-page-order
    306  1.1    cherry  *      Values:         <uint32_t>
    307  1.1    cherry  *      Default Value:  0
    308  1.1    cherry  *      Maximum Value:  MAX(ffs(max-ring-pages) - 1, max-ring-page-order)
    309  1.1    cherry  *      Notes:          1, 3
    310  1.1    cherry  *
    311  1.1    cherry  *      The size of the frontend allocated request ring buffer in units
    312  1.1    cherry  *      of lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
    313  1.1    cherry  *      etc.).
    314  1.1    cherry  *
    315  1.1    cherry  * num-ring-pages
    316  1.1    cherry  *      Values:         <uint32_t>
    317  1.1    cherry  *      Default Value:  1
    318  1.1    cherry  *      Maximum Value:  MAX(max-ring-pages,(0x1 << max-ring-page-order))
    319  1.1    cherry  *      Notes:          DEPRECATED, 2, 3
    320  1.1    cherry  *
    321  1.1    cherry  *      The size of the frontend allocated request ring buffer in units of
    322  1.1    cherry  *      machine pages.  The value must be a power of 2.
    323  1.1    cherry  *
    324  1.1    cherry  * feature-persistent
    325  1.1    cherry  *      Values:         0/1 (boolean)
    326  1.1    cherry  *      Default Value:  0
    327  1.1    cherry  *      Notes: 7, 8, 9
    328  1.1    cherry  *
    329  1.1    cherry  *      A value of "1" indicates that the frontend will reuse the same grants
    330  1.1    cherry  *      for all transactions, allowing the backend to map them with write
    331  1.1    cherry  *      access (even when it should be read-only). If the frontend hits the
    332  1.1    cherry  *      maximum number of allowed persistently mapped grants, it can fall back
    333  1.1    cherry  *      to non-persistent mode. This will cause a performance degradation,
    334  1.1    cherry  *      since the backend driver will still try to map those grants
    335  1.1    cherry  *      persistently. Since the persistent grants protocol is compatible with
    336  1.1    cherry  *      the previous protocol, a frontend driver can choose to work in
    337  1.1    cherry  *      persistent mode even when the backend doesn't support it.
    338  1.1    cherry  *
    339  1.1    cherry  *      It is recommended that the frontend driver stores the persistently
    340  1.1    cherry  *      mapped grants in a LIFO queue, so a subset of all persistently mapped
    341  1.1    cherry  *      grants gets used commonly. This is done in case the backend driver
    342  1.1    cherry  *      decides to limit the maximum number of persistently mapped grants
    343  1.1    cherry  *      to a value less than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST.
    344  1.1    cherry  *
    345  1.1    cherry  *------------------------- Virtual Device Properties -------------------------
    346  1.1    cherry  *
    347  1.1    cherry  * device-type
    348  1.1    cherry  *      Values:         "disk", "cdrom", "floppy", etc.
    349  1.1    cherry  *
    350  1.1    cherry  * virtual-device
    351  1.1    cherry  *      Values:         <uint32_t>
    352  1.1    cherry  *
    353  1.1    cherry  *      A value indicating the physical device to virtualize within the
    354  1.1    cherry  *      frontend's domain.  (e.g. "The first ATA disk", "The third SCSI
    355  1.1    cherry  *      disk", etc.)
    356  1.1    cherry  *
    357  1.1    cherry  *      See docs/misc/vbd-interface.txt for details on the format of this
    358  1.1    cherry  *      value.
    359  1.1    cherry  *
    360  1.1    cherry  * Notes
    361  1.1    cherry  * -----
    362  1.1    cherry  * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer
    363  1.1    cherry  *     PV drivers.
    364  1.1    cherry  * (2) Multi-page ring buffer scheme first used in some RedHat distributions
    365  1.1    cherry  *     including a distribution deployed on certain nodes of the Amazon
    366  1.1    cherry  *     EC2 cluster.
    367  1.1    cherry  * (3) Support for multi-page ring buffers was implemented independently,
    368  1.1    cherry  *     in slightly different forms, by both Citrix and RedHat/Amazon.
    369  1.1    cherry  *     For full interoperability, block front and backends should publish
    370  1.1    cherry  *     identical ring parameters, adjusted for unit differences, to the
    371  1.1    cherry  *     XenStore nodes used in both schemes.
    372  1.1    cherry  * (4) Devices that support discard functionality may internally allocate space
    373  1.1    cherry  *     (discardable extents) in units that are larger than the exported logical
    374  1.1    cherry  *     block size. If the backing device has such discardable extents the
    375  1.1    cherry  *     backend should provide both discard-granularity and discard-alignment.
    376  1.1    cherry  *     Providing just one of the two may be considered an error by the frontend.
    377  1.1    cherry  *     Backends supporting discard should include discard-granularity and
    378  1.1    cherry  *     discard-alignment even if they support discarding individual sectors.
    379  1.1    cherry  *     Frontends should assume discard-alignment == 0 and discard-granularity
    380  1.1    cherry  *     == sector size if these keys are missing.
    381  1.1    cherry  * (5) The discard-alignment parameter allows a physical device to be
    382  1.1    cherry  *     partitioned into virtual devices that do not necessarily begin or
    383  1.1    cherry  *     end on a discardable extent boundary.
    384  1.1    cherry  * (6) When there is only a single page allocated to the request ring,
    385  1.1    cherry  *     'ring-ref' is used to communicate the grant reference for this
    386  1.1    cherry  *     page to the backend.  When using a multi-page ring, the 'ring-ref'
    387  1.1    cherry  *     node is not created.  Instead 'ring-ref0' - 'ring-refN' are used.
    388  1.1    cherry  * (7) When using persistent grants data has to be copied from/to the page
    389  1.1    cherry  *     where the grant is currently mapped. The overhead of doing this copy
    390  1.1    cherry  *     however doesn't outweigh the speed improvement of not having to unmap
    391  1.1    cherry  *     the grants.
    392  1.1    cherry  * (8) The frontend driver has to allow the backend driver to map all grants
    393  1.1    cherry  *     with write access, even when they should be mapped read-only, since
    394  1.1    cherry  *     further requests may reuse these grants and require write permissions.
    395  1.1    cherry  * (9) Linux implementation doesn't have a limit on the maximum number of
    396  1.1    cherry  *     grants that can be persistently mapped in the frontend driver, but
    397  1.1    cherry  *     due to the frontend driver implementation it should never be bigger
    398  1.1    cherry  *     than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST.
    399  1.1    cherry  *(10) The discard-secure property may be present and will be set to 1 if the
    400  1.1    cherry  *     backing device supports secure discard.
    401  1.1    cherry  *(11) Only used by Linux and NetBSD.
    402  1.1    cherry  */
    403  1.1    cherry 
    404  1.1    cherry /*
    405  1.1    cherry  * Multiple hardware queues/rings:
    406  1.1    cherry  * If supported, the backend will write the key "multi-queue-max-queues" to
    407  1.1    cherry  * the directory for that vbd, and set its value to the maximum supported
    408  1.1    cherry  * number of queues.
    409  1.1    cherry  * Frontends that are aware of this feature and wish to use it can write the
    410  1.1    cherry  * key "multi-queue-num-queues" with the number they wish to use, which must be
    411  1.1    cherry  * greater than zero, and no more than the value reported by the backend in
    412  1.1    cherry  * "multi-queue-max-queues".
    413  1.1    cherry  *
    414  1.1    cherry  * For frontends requesting just one queue, the usual event-channel and
    415  1.1    cherry  * ring-ref keys are written as before, simplifying the backend processing
    416  1.1    cherry  * to avoid distinguishing between a frontend that doesn't understand the
    417  1.1    cherry  * multi-queue feature, and one that does, but requested only one queue.
    418  1.1    cherry  *
    419  1.1    cherry  * Frontends requesting two or more queues must not write the toplevel
    420  1.1    cherry  * event-channel and ring-ref keys, instead writing those keys under sub-keys
    421  1.1    cherry  * having the name "queue-N" where N is the integer ID of the queue/ring for
    422  1.1    cherry  * which those keys belong. Queues are indexed from zero.
    423  1.1    cherry  * For example, a frontend with two queues must write the following set of
    424  1.1    cherry  * queue-related keys:
    425  1.1    cherry  *
    426  1.1    cherry  * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
    427  1.1    cherry  * /local/domain/1/device/vbd/0/queue-0 = ""
    428  1.1    cherry  * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"
    429  1.1    cherry  * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
    430  1.1    cherry  * /local/domain/1/device/vbd/0/queue-1 = ""
    431  1.1    cherry  * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"
    432  1.1    cherry  * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
    433  1.1    cherry  *
    434  1.1    cherry  * It is also possible to use multiple queues/rings together with
    435  1.1    cherry  * the multi-page ring buffer feature.
    436  1.1    cherry  * For example, a frontend that requests two queues/rings, each with a ring
    437  1.1    cherry  * buffer of two pages, must write the following set of related keys:
    438  1.1    cherry  *
    439  1.1    cherry  * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
    440  1.1    cherry  * /local/domain/1/device/vbd/0/ring-page-order = "1"
    441  1.1    cherry  * /local/domain/1/device/vbd/0/queue-0 = ""
    442  1.1    cherry  * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"
    443  1.1    cherry  * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"
    444  1.1    cherry  * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
    445  1.1    cherry  * /local/domain/1/device/vbd/0/queue-1 = ""
    446  1.1    cherry  * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"
    447  1.1    cherry  * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"
    448  1.1    cherry  * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
    449  1.1    cherry  *
    450  1.1    cherry  */
    451  1.1    cherry 
    452  1.1    cherry /*
    453  1.1    cherry  * STATE DIAGRAMS
    454  1.1    cherry  *
    455  1.1    cherry  *****************************************************************************
    456  1.1    cherry  *                                   Startup                                 *
    457  1.1    cherry  *****************************************************************************
    458  1.1    cherry  *
    459  1.1    cherry  * Tool stack creates front and back nodes with state XenbusStateInitialising.
    460  1.1    cherry  *
    461  1.1    cherry  * Front                                Back
    462  1.1    cherry  * =================================    =====================================
    463  1.1    cherry  * XenbusStateInitialising              XenbusStateInitialising
    464  1.1    cherry  *  o Query virtual device               o Query backend device identification
    465  1.1    cherry  *    properties.                          data.
    466  1.1    cherry  *  o Setup OS device instance.          o Open and validate backend device.
    467  1.1    cherry  *                                       o Publish backend features and
    468  1.1    cherry  *                                         transport parameters.
    469  1.1    cherry  *                                                      |
    470  1.1    cherry  *                                                      |
    471  1.1    cherry  *                                                      V
    472  1.1    cherry  *                                      XenbusStateInitWait
    473  1.1    cherry  *
    474  1.1    cherry  * o Query backend features and
    475  1.1    cherry  *   transport parameters.
    476  1.1    cherry  * o Allocate and initialize the
    477  1.1    cherry  *   request ring.
    478  1.1    cherry  * o Publish transport parameters
    479  1.1    cherry  *   that will be in effect during
    480  1.1    cherry  *   this connection.
    481  1.1    cherry  *              |
    482  1.1    cherry  *              |
    483  1.1    cherry  *              V
    484  1.1    cherry  * XenbusStateInitialised
    485  1.1    cherry  *
    486  1.1    cherry  *                                       o Query frontend transport parameters.
    487  1.1    cherry  *                                       o Connect to the request ring and
    488  1.1    cherry  *                                         event channel.
    489  1.1    cherry  *                                       o Publish backend device properties.
    490  1.1    cherry  *                                                      |
    491  1.1    cherry  *                                                      |
    492  1.1    cherry  *                                                      V
    493  1.1    cherry  *                                      XenbusStateConnected
    494  1.1    cherry  *
    495  1.1    cherry  *  o Query backend device properties.
    496  1.1    cherry  *  o Finalize OS virtual device
    497  1.1    cherry  *    instance.
    498  1.1    cherry  *              |
    499  1.1    cherry  *              |
    500  1.1    cherry  *              V
    501  1.1    cherry  * XenbusStateConnected
    502  1.1    cherry  *
    503  1.1    cherry  * Note: Drivers that do not support any optional features, or the negotiation
    504  1.1    cherry  *       of transport parameters, can skip certain states in the state machine:
    505  1.1    cherry  *
    506  1.1    cherry  *       o A frontend may transition to XenbusStateInitialised without
    507  1.1    cherry  *         waiting for the backend to enter XenbusStateInitWait.  In this
    508  1.1    cherry  *         case, default transport parameters are in effect and any
    509  1.1    cherry  *         transport parameters published by the frontend must contain
    510  1.1    cherry  *         their default values.
    511  1.1    cherry  *
    512  1.1    cherry  *       o A backend may transition to XenbusStateInitialised, bypassing
    513  1.1    cherry  *         XenbusStateInitWait, without waiting for the frontend to first
    514  1.1    cherry  *         enter the XenbusStateInitialised state.  In this case, default
    515  1.1    cherry  *         transport parameters are in effect and any transport parameters
    516  1.1    cherry  *         published by the backend must contain their default values.
    517  1.1    cherry  *
    518  1.1    cherry  *       Drivers that support optional features and/or transport parameter
    519  1.1    cherry  *       negotiation must tolerate these additional state transition paths.
    520  1.1    cherry  *       In general this means performing the work of any skipped state
    521  1.1    cherry  *       transition, if it has not already been performed, in addition to the
    522  1.1    cherry  *       work associated with entry into the current state.
    523  1.1    cherry  */
    524  1.1    cherry 
    525  1.1    cherry /*
    526  1.1    cherry  * REQUEST CODES.
    527  1.1    cherry  */
    528  1.1    cherry #define BLKIF_OP_READ              0
    529  1.1    cherry #define BLKIF_OP_WRITE             1
    530  1.1    cherry /*
    531  1.1    cherry  * All writes issued prior to a request with the BLKIF_OP_WRITE_BARRIER
    532  1.1    cherry  * operation code ("barrier request") must be completed prior to the
    533  1.1    cherry  * execution of the barrier request.  All writes issued after the barrier
    534  1.1    cherry  * request must not execute until after the completion of the barrier request.
    535  1.1    cherry  *
    536  1.1    cherry  * Optional.  See "feature-barrier" XenBus node documentation above.
    537  1.1    cherry  */
    538  1.1    cherry #define BLKIF_OP_WRITE_BARRIER     2
    539  1.1    cherry /*
    540  1.1    cherry  * Commit any uncommitted contents of the backing device's volatile cache
    541  1.1    cherry  * to stable storage.
    542  1.1    cherry  *
    543  1.1    cherry  * Optional.  See "feature-flush-cache" XenBus node documentation above.
    544  1.1    cherry  */
    545  1.1    cherry #define BLKIF_OP_FLUSH_DISKCACHE   3
    546  1.1    cherry /*
    547  1.1    cherry  * Used in SLES sources for device specific command packet
    548  1.1    cherry  * contained within the request. Reserved for that purpose.
    549  1.1    cherry  */
    550  1.1    cherry #define BLKIF_OP_RESERVED_1        4
    551  1.1    cherry /*
    552  1.1    cherry  * Indicate to the backend device that a region of storage is no longer in
    553  1.1    cherry  * use, and may be discarded at any time without impact to the client.  If
    554  1.1    cherry  * the BLKIF_DISCARD_SECURE flag is set on the request, all copies of the
    555  1.1    cherry  * discarded region on the device must be rendered unrecoverable before the
    556  1.1    cherry  * command returns.
    557  1.1    cherry  *
    558  1.1    cherry  * This operation is analogous to performing a trim (ATA) or unmap (SCSI)
    559  1.1    cherry  * command on a native device.
    560  1.1    cherry  *
    561  1.1    cherry  * More information about trim/unmap operations can be found at:
    562  1.1    cherry  * http://t13.org/Documents/UploadedDocuments/docs2008/
    563  1.1    cherry  *     e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc
    564  1.1    cherry  * http://www.seagate.com/staticfiles/support/disc/manuals/
    565  1.1    cherry  *     Interface%20manuals/100293068c.pdf
    566  1.1    cherry  *
    567  1.1    cherry  * Optional.  See "feature-discard", "discard-alignment",
    568  1.1    cherry  * "discard-granularity", and "discard-secure" in the XenBus node
    569  1.1    cherry  * documentation above.
    570  1.1    cherry  */
    571  1.1    cherry #define BLKIF_OP_DISCARD           5
    572  1.1    cherry 
    573  1.1    cherry /*
    574  1.1    cherry  * Recognized if "feature-max-indirect-segments" is present in the backend
    575  1.1    cherry  * xenbus info. The "feature-max-indirect-segments" node contains the maximum
    576  1.1    cherry  * number of segments allowed by the backend per request. If the node is
    577  1.1    cherry  * present, the frontend might use blkif_request_indirect structs in order to
    578  1.1    cherry  * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
    579  1.1    cherry  * maximum number of indirect segments is fixed by the backend, but the
    580  1.1    cherry  * frontend can issue requests with any number of indirect segments as long as
    581  1.1    cherry  * it's less than the number provided by the backend. The indirect_grefs field
    582  1.1    cherry  * in blkif_request_indirect should be filled by the frontend with the
    583  1.1    cherry  * grant references of the pages that are holding the indirect segments.
    584  1.1    cherry  * These pages are filled with an array of blkif_request_segment that hold the
    585  1.1    cherry  * information about the segments. The number of indirect pages to use is
    586  1.1    cherry  * determined by the number of segments an indirect request contains. Every
    587  1.1    cherry  * indirect page can contain a maximum of
    588  1.1    cherry  * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to
    589  1.1    cherry  * calculate the number of indirect pages to use we have to do
    590  1.1    cherry  * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))).
    591  1.1    cherry  *
    592  1.1    cherry  * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
    593  1.1    cherry  * create the "feature-max-indirect-segments" node!
    594  1.1    cherry  */
    595  1.1    cherry #define BLKIF_OP_INDIRECT          6
    596  1.1    cherry 
    597  1.1    cherry /*
    598  1.1    cherry  * Maximum scatter/gather segments per request.
    599  1.1    cherry  * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
    600  1.1    cherry  * NB. This could be 12 if the ring indexes weren't stored in the same page.
    601  1.1    cherry  */
    602  1.1    cherry #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
    603  1.1    cherry 
    604  1.1    cherry /*
    605  1.1    cherry  * Maximum number of indirect pages to use per request.
    606  1.1    cherry  */
    607  1.1    cherry #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
    608  1.1    cherry 
    609  1.1    cherry /*
    610  1.1    cherry  * NB. first_sect and last_sect in blkif_request_segment, as well as
    611  1.1    cherry  * sector_number in blkif_request, are always expressed in 512-byte units.
    612  1.1    cherry  * However they must be properly aligned to the real sector size of the
    613  1.1    cherry  * physical disk, which is reported in the "physical-sector-size" node in
    614  1.1    cherry  * the backend xenbus info. Also the xenbus "sectors" node is expressed in
    615  1.1    cherry  * 512-byte units.
    616  1.1    cherry  */
    617  1.1    cherry struct blkif_request_segment {   /* one scatter/gather segment       */
    618  1.1    cherry     grant_ref_t gref;        /* grant reference to I/O buffer frame  */
    619  1.1    cherry     /* @first_sect: first 512-byte sector in frame to transfer (inclusive). */
    620  1.1    cherry     /* @last_sect: last 512-byte sector in frame to transfer (inclusive).   */
    621  1.1    cherry     uint8_t     first_sect, last_sect;
    622  1.1    cherry };
    623  1.1    cherry 
    624  1.1    cherry /*
    625  1.1    cherry  * Starting ring element for any I/O request.
    626  1.1    cherry  */
    627  1.1    cherry struct blkif_request {
    628  1.1    cherry     uint8_t        operation;    /* BLKIF_OP_??? request code            */
    629  1.1    cherry     uint8_t        nr_segments;  /* number of segments                   */
    630  1.1    cherry     blkif_vdev_t   handle;       /* only for read/write requests         */
    631  1.1    cherry     uint64_t       id;           /* private guest value, echoed in resp  */
    632  1.1    cherry     blkif_sector_t sector_number;/* start sector on disk, 512-byte units (r/w only) */
    633  1.1    cherry     struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* scatter/gather segments */
    634  1.1    cherry };
    635  1.1    cherry typedef struct blkif_request blkif_request_t;
    636  1.1    cherry 
    637  1.1    cherry /*
    638  1.1    cherry  * Cast to this structure when blkif_request.operation == BLKIF_OP_DISCARD.
    639  1.1    cherry  * Note: sizeof(struct blkif_request_discard) <= sizeof(struct blkif_request).
    640  1.1    cherry  */
    641  1.1    cherry struct blkif_request_discard {
    642  1.1    cherry     uint8_t        operation;    /* BLKIF_OP_DISCARD                     */
    643  1.1    cherry     uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
    644  1.1    cherry #define BLKIF_DISCARD_SECURE (1<<0)  /* ignored if discard-secure=0      */
    645  1.1    cherry     blkif_vdev_t   handle;       /* same as for read/write requests      */
    646  1.1    cherry     uint64_t       id;           /* private guest value, echoed in resp  */
    647  1.1    cherry     blkif_sector_t sector_number;/* start sector idx on disk             */
    648  1.1    cherry     uint64_t       nr_sectors;   /* number of contiguous sectors to discard */
    649  1.1    cherry };
    650  1.1    cherry typedef struct blkif_request_discard blkif_request_discard_t;
    651  1.1    cherry 
    652  1.1    cherry struct blkif_request_indirect {  /* request layout used with BLKIF_OP_INDIRECT */
    653  1.1    cherry     uint8_t        operation;    /* BLKIF_OP_INDIRECT                    */
    654  1.1    cherry     uint8_t        indirect_op;  /* actual operation: BLKIF_OP_{READ/WRITE} */
    655  1.1    cherry     uint16_t       nr_segments;  /* number of indirect segments          */
    656  1.1    cherry     uint64_t       id;           /* private guest value, echoed in resp  */
    657  1.1    cherry     blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
    658  1.1    cherry     blkif_vdev_t   handle;       /* same as for read/write requests      */
    659  1.1    cherry     grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; /* grefs of pages holding the segments */
    660  1.3  jdolecek #ifdef __i386__
    661  1.3  jdolecek     uint64_t       pad;          /* Make it 64 byte aligned on i386      */
    662  1.1    cherry #endif
    663  1.3  jdolecek };
    664  1.1    cherry typedef struct blkif_request_indirect blkif_request_indirect_t;
    665  1.1    cherry 
    666  1.1    cherry struct blkif_response {              /* response ring element */
    667  1.1    cherry     uint64_t        id;              /* copied from request (blkif_request.id) */
    668  1.1    cherry     uint8_t         operation;       /* copied from request */
    669  1.1    cherry     int16_t         status;          /* BLKIF_RSP_??? status return code */
    670  1.1    cherry };
    671  1.1    cherry typedef struct blkif_response blkif_response_t;
    672  1.1    cherry 
    673  1.1    cherry /*
    674  1.1    cherry  * STATUS RETURN CODES.
    675  1.1    cherry  */
    676  1.1    cherry  /* Operation not supported (only happens on barrier writes). */
    677  1.1    cherry #define BLKIF_RSP_EOPNOTSUPP  -2
    678  1.1    cherry  /* Operation failed for some unspecified reason (-EIO). */
    679  1.1    cherry #define BLKIF_RSP_ERROR       -1
    680  1.1    cherry  /* Operation completed successfully. */
    681  1.1    cherry #define BLKIF_RSP_OKAY         0
    682  1.1    cherry 
    683  1.1    cherry /*
    684  1.1    cherry  * Generate blkif ring structures and types.
    685  1.1    cherry  */
    686  1.1    cherry DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
    687  1.1    cherry 
    688  1.1    cherry #define VDISK_CDROM        0x1
    689  1.1    cherry #define VDISK_REMOVABLE    0x2
    690  1.1    cherry #define VDISK_READONLY     0x4
    691  1.1    cherry 
    692  1.1    cherry #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
    693  1.1    cherry 
    694  1.1    cherry /*
    695  1.1    cherry  * Local variables:
    696  1.1    cherry  * mode: C
    697  1.1    cherry  * c-file-style: "BSD"
    698  1.1    cherry  * c-basic-offset: 4
    699  1.1    cherry  * tab-width: 4
    700  1.1    cherry  * indent-tabs-mode: nil
    701  1.1    cherry  * End:
    702  1.1    cherry  */
    703