Home | History | Annotate | Line # | Download | only in xen
privcmd.c revision 1.45
      1 /* $NetBSD: privcmd.c,v 1.44 2012/06/27 00:37:10 jym Exp $ */
      2 
      3 /*-
      4  * Copyright (c) 2004 Christian Limpach.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 
     29 #include <sys/cdefs.h>
     30 __KERNEL_RCSID(0, "$NetBSD: privcmd.c,v 1.44 2012/06/27 00:37:10 jym Exp $");
     31 
     32 #include <sys/param.h>
     33 #include <sys/systm.h>
     34 #include <sys/vnode.h>
     35 #include <sys/malloc.h>
     36 #include <sys/dirent.h>
     37 #include <sys/stat.h>
     38 #include <sys/proc.h>
     39 
     40 #include <miscfs/specfs/specdev.h>
     41 #include <miscfs/kernfs/kernfs.h>
     42 
     43 #include <uvm/uvm.h>
     44 #include <uvm/uvm_fault.h>
     45 #include <uvm/uvm_fault_i.h>
     46 
     47 #include <xen/kernfs_machdep.h>
     48 #include <xen/xenio.h>
     49 
     50 #define	PRIVCMD_MODE	(S_IRUSR)
     51 
     52 /* Magic value is used to mark invalid pages.
     53  * This must be a value within the page-offset.
     54  * Page-aligned values including 0x0 are used by the guest.
     55  */
     56 #define INVALID_PAGE	0xfff
     57 
/*
 * UVM object backing a privcmd mapping: remembers, for each page of the
 * mapping, the machine address to enter on fault, plus the owning domain.
 */
struct privcmd_object {
	struct uvm_object uobj;	/* generic UVM object; must come first */
	paddr_t *maddr; /* array of machine address to map */
	int	npages;	/* number of entries in maddr */
	int	domid;	/* domain the machine pages belong to */
};
     64 
/* number of live privcmd objects (informational/debug counter) */
int privcmd_nobjects = 0;

/* pager operations and mapping helper, defined below */
static void privpgop_reference(struct uvm_object *);
static void privpgop_detach(struct uvm_object *);
static int privpgop_fault(struct uvm_faultinfo *, vaddr_t , struct vm_page **,
			 int, int, vm_prot_t, int);
static int privcmd_map_obj(struct vm_map *, vaddr_t, paddr_t *, int, int);
     72 
     73 
/*
 * Translate a (negated) Xen error value to its NetBSD equivalent.
 *
 * Xen uses System V (Linux) error codes.  In order to keep bloat as
 * minimal as possible, only the codes that really impact us are
 * translated; anything unknown is logged and passed through unchanged
 * (as a positive value).
 */
static int
privcmd_xen2bsd_errno(int error)
{
	static const struct {
		int xen_err;	/* Linux errno value (positive) */
		int bsd_err;	/* corresponding NetBSD errno */
	} errtab[] = {
		{   0, 0 },
		{   1, EPERM },
		{   2, ENOENT },
		{   3, ESRCH },
		{   4, EINTR },
		{   5, EIO },
		{   6, ENXIO },
		{   7, E2BIG },
		{   8, ENOEXEC },
		{   9, EBADF },
		{  10, ECHILD },
		{  11, EAGAIN },
		{  12, ENOMEM },
		{  13, EACCES },
		{  14, EFAULT },
		{  15, ENOTBLK },
		{  16, EBUSY },
		{  17, EEXIST },
		{  18, EXDEV },
		{  19, ENODEV },
		{  20, ENOTDIR },
		{  21, EISDIR },
		{  22, EINVAL },
		{  23, ENFILE },
		{  24, EMFILE },
		{  25, ENOTTY },
		{  26, ETXTBSY },
		{  27, EFBIG },
		{  28, ENOSPC },
		{  29, ESPIPE },
		{  30, EROFS },
		{  31, EMLINK },
		{  32, EPIPE },
		{  33, EDOM },
		{  34, ERANGE },
		{  35, EDEADLK },
		{  36, ENAMETOOLONG },
		{  37, ENOLCK },
		{  38, ENOSYS },
		{  39, ENOTEMPTY },
		{  40, ELOOP },
		{  42, ENOMSG },
		{  43, EIDRM },
		{  60, ENOSTR },
		{  61, ENODATA },
		{  62, ETIME },
		{  63, ENOSR },
		{  66, EREMOTE },
		{  74, EBADMSG },
		{  75, EOVERFLOW },
		{  84, EILSEQ },
		{  87, EUSERS },
		{  88, ENOTSOCK },
		{  89, EDESTADDRREQ },
		{  90, EMSGSIZE },
		{  91, EPROTOTYPE },
		{  92, ENOPROTOOPT },
		{  93, EPROTONOSUPPORT },
		{  94, ESOCKTNOSUPPORT },
		{  95, EOPNOTSUPP },
		{  96, EPFNOSUPPORT },
		{  97, EAFNOSUPPORT },
		{  98, EADDRINUSE },
		{  99, EADDRNOTAVAIL },
		{ 100, ENETDOWN },
		{ 101, ENETUNREACH },
		{ 102, ENETRESET },
		{ 103, ECONNABORTED },
		{ 104, ECONNRESET },
		{ 105, ENOBUFS },
		{ 106, EISCONN },
		{ 107, ENOTCONN },
		{ 108, ESHUTDOWN },
		{ 109, ETOOMANYREFS },
		{ 110, ETIMEDOUT },
		{ 111, ECONNREFUSED },
		{ 112, EHOSTDOWN },
		{ 113, EHOSTUNREACH },
		{ 114, EALREADY },
		{ 115, EINPROGRESS },
		{ 116, ESTALE },
		{ 122, EDQUOT },
	};
	const int xerr = -error;
	unsigned int i;

	for (i = 0; i < sizeof(errtab) / sizeof(errtab[0]); i++) {
		if (errtab[i].xen_err == xerr)
			return errtab[i].bsd_err;
	}
	printf("unknown xen error code %d\n", xerr);
	return xerr;
}
    253 
/*
 * ioctl handler for the kernfs /kern/xen/privcmd node.
 *
 * Supports three commands:
 *  - IOCTL_PRIVCMD_HYPERCALL(_OLD): issue a hypercall on behalf of
 *    userland (the management tools), via the hypercall_page trampoline.
 *  - IOCTL_PRIVCMD_MMAP: map caller-specified ranges of foreign machine
 *    frames into the calling process' address space.
 *  - IOCTL_PRIVCMD_MMAPBATCH: like MMAP, but one mfn per page, with
 *    per-page failure reporting back into the user array.
 */
static int
privcmd_ioctl(void *v)
{
	struct vop_ioctl_args /* {
		const struct vnodeop_desc *a_desc;
		struct vnode *a_vp;
		u_long a_command;
		void *a_data;
		int a_fflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	int error = 0;
	paddr_t *maddr;

	switch (ap->a_command) {
	case IOCTL_PRIVCMD_HYPERCALL:
	case IOCTL_PRIVCMD_HYPERCALL_OLD:
	/*
	 * oprivcmd_hypercall_t is privcmd_hypercall_t without the last entry
	 */
	{
		privcmd_hypercall_t *hc = ap->a_data;
		/*
		 * The hypercall page holds one 32-byte stub per hypercall,
		 * hence PAGE_SIZE >> 5 valid operation numbers.
		 */
		if (hc->op >= (PAGE_SIZE >> 5))
			return EINVAL;
		error = -EOPNOTSUPP;
#if defined(__i386__)
		/*
		 * Load the 5 hypercall arguments into ebx/ecx/edx/esi/edi
		 * and call hypercall_page + op * 32.
		 */
		__asm volatile (
			"pushl %%ebx; pushl %%ecx; pushl %%edx;"
			"pushl %%esi; pushl %%edi; "
			"movl  4(%%eax),%%ebx ;"
			"movl  8(%%eax),%%ecx ;"
			"movl 12(%%eax),%%edx ;"
			"movl 16(%%eax),%%esi ;"
			"movl 20(%%eax),%%edi ;"
			"movl   (%%eax),%%eax ;"
			"shll $5,%%eax ;"
			"addl $hypercall_page,%%eax ;"
			"call *%%eax ;"
			"popl %%edi; popl %%esi; popl %%edx;"
			"popl %%ecx; popl %%ebx"
			: "=a" (error) : "0" (ap->a_data) : "memory" );
#endif /* __i386__ */
#if defined(__x86_64__)
		{
		long i1, i2, i3;
		/*
		 * amd64 hypercall ABI: args in rdi/rsi/rdx/r10/r8,
		 * stub at hypercall_page + op * 32.
		 */
		__asm volatile (
			"movq %8,%%r10; movq %9,%%r8;"
			"shll $5,%%eax ;"
			"addq $hypercall_page,%%rax ;"
			"call *%%rax"
			: "=a" (error), "=D" (i1),
			  "=S" (i2), "=d" (i3)
			: "0" ((unsigned int)hc->op),
			  "1" (hc->arg[0]),
			  "2" (hc->arg[1]),
			  "3" (hc->arg[2]),
			  "g" (hc->arg[3]),
			  "g" (hc->arg[4])
			: "r8", "r10", "memory" );
		}
#endif /* __x86_64__ */
		if (ap->a_command == IOCTL_PRIVCMD_HYPERCALL) {
			/* new-style interface: split retval and errno */
			if (error >= 0) {
				hc->retval = error;
				error = 0;
			} else {
				/* error occurred, return the errno */
				error = privcmd_xen2bsd_errno(error);
				hc->retval = 0;
			}
		} else {
			/* old interface returned the translated code directly */
			error = privcmd_xen2bsd_errno(error);
		}
		break;
	}
	case IOCTL_PRIVCMD_MMAP:
	{
		int i, j;
		privcmd_mmap_t *mcmd = ap->a_data;
		privcmd_mmap_entry_t mentry;
		vaddr_t va;
		paddr_t ma;
		struct vm_map *vmm = &curlwp->l_proc->p_vmspace->vm_map;

		/* one mapping entry at a time, copied in from userland */
		for (i = 0; i < mcmd->num; i++) {
			error = copyin(&mcmd->entry[i], &mentry, sizeof(mentry));
			if (error)
				return error;
			if (mentry.npages == 0)
				return EINVAL;
			if (mentry.va > VM_MAXUSER_ADDRESS)
				return EINVAL;
#if 0
			if (mentry.va + (mentry.npages << PGSHIFT) >
			    mrentry->vm_end)
				return EINVAL;
#endif
			/*
			 * NOTE(review): with KM_SLEEP, kmem_alloc should not
			 * return NULL; the check below looks like dead code.
			 */
			maddr = kmem_alloc(sizeof(paddr_t) * mentry.npages,
			    KM_SLEEP);
			if (maddr == NULL)
				return ENOMEM;
			va = mentry.va & ~PAGE_MASK;
			/* expand the contiguous mfn range into per-page addrs */
			ma = ((paddr_t)mentry.mfn) <<  PGSHIFT; /* XXX ??? */
			for (j = 0; j < mentry.npages; j++) {
				maddr[j] = ma;
				ma += PAGE_SIZE;
			}
			/* maddr ownership passes to privcmd_map_obj */
			error  = privcmd_map_obj(vmm, va, maddr,
			    mentry.npages, mcmd->dom);
			if (error)
				return error;
		}
		break;
	}
	case IOCTL_PRIVCMD_MMAPBATCH:
	{
		int i;
		privcmd_mmapbatch_t* pmb = ap->a_data;
		vaddr_t va0;
		u_long mfn;
		paddr_t ma;
		struct vm_map *vmm;
		struct vm_map_entry *entry;
		vm_prot_t prot;
		vaddr_t trymap;

		vmm = &curlwp->l_proc->p_vmspace->vm_map;
		va0 = pmb->addr & ~PAGE_MASK;

		/* sanity-check the user-supplied range */
		if (pmb->num == 0)
			return EINVAL;
		if (va0 > VM_MAXUSER_ADDRESS)
			return EINVAL;
		if (((VM_MAXUSER_ADDRESS - va0) >> PGSHIFT) < pmb->num)
			return EINVAL;

		/* the target range must already be mapped; steal its prot */
		vm_map_lock_read(vmm);
		if (!uvm_map_lookup_entry(vmm, va0, &entry)) {
			vm_map_unlock_read(vmm);
			return EINVAL;
		}
		prot = entry->protection;
		vm_map_unlock_read(vmm);

		maddr = kmem_alloc(sizeof(paddr_t) * pmb->num, KM_SLEEP);
		if (maddr == NULL)
			return ENOMEM;
		/* get a page of KVA to check mappins */
		trymap = uvm_km_alloc(kernel_map, PAGE_SIZE, PAGE_SIZE,
		    UVM_KMF_VAONLY);
		if (trymap == 0) {
			kmem_free(maddr, sizeof(paddr_t) * pmb->num);
			return ENOMEM;
		}

		for(i = 0; i < pmb->num; ++i) {
			error = copyin(&pmb->arr[i], &mfn, sizeof(mfn));
			if (error != 0) {
				/* XXX: mappings */
				pmap_update(pmap_kernel());
				kmem_free(maddr, sizeof(paddr_t) * pmb->num);
				uvm_km_free(kernel_map, trymap, PAGE_SIZE,
				    UVM_KMF_VAONLY);
				return error;
			}
			ma = ((paddr_t)mfn) << PGSHIFT;
			/*
			 * Probe-map each foreign frame at trymap first, so a
			 * bad mfn is detected now rather than at fault time.
			 */
			if (pmap_enter_ma(pmap_kernel(), trymap, ma, 0,
			    prot, PMAP_CANFAIL, pmb->dom)) {
				/* report failure back to userland in-array */
				mfn |= 0xF0000000;
				copyout(&mfn, &pmb->arr[i], sizeof(mfn));
				maddr[i] = INVALID_PAGE;
			} else {
				pmap_remove(pmap_kernel(), trymap,
				    trymap + PAGE_SIZE);
				maddr[i] = ma;
			}
		}
		pmap_update(pmap_kernel());

		/* maddr ownership passes to privcmd_map_obj */
		error = privcmd_map_obj(vmm, va0, maddr, pmb->num, pmb->dom);
		uvm_km_free(kernel_map, trymap, PAGE_SIZE, UVM_KMF_VAONLY);

		if (error != 0)
			return error;

		break;
	}
	default:
		error = EINVAL;
	}

	return error;
}
    447 
/*
 * Pager operations for privcmd objects.  No pgo_get is provided: every
 * fault goes through privpgop_fault, which enters the stored machine
 * addresses directly.
 */
static struct uvm_pagerops privpgops = {
  .pgo_reference = privpgop_reference,
  .pgo_detach = privpgop_detach,
  .pgo_fault = privpgop_fault,
};
    453 
    454 static void
    455 privpgop_reference(struct uvm_object *uobj)
    456 {
    457 	mutex_enter(uobj->vmobjlock);
    458 	uobj->uo_refs++;
    459 	mutex_exit(uobj->vmobjlock);
    460 }
    461 
    462 static void
    463 privpgop_detach(struct uvm_object *uobj)
    464 {
    465 	struct privcmd_object *pobj = (struct privcmd_object *)uobj;
    466 
    467 	mutex_enter(uobj->vmobjlock);
    468 	if (uobj->uo_refs > 1) {
    469 		uobj->uo_refs--;
    470 		mutex_exit(uobj->vmobjlock);
    471 		return;
    472 	}
    473 	mutex_exit(uobj->vmobjlock);
    474 	kmem_free(pobj->maddr, sizeof(paddr_t) * pobj->npages);
    475 	uvm_obj_destroy(uobj, true);
    476 	kmem_free(pobj, sizeof(struct privcmd_object));
    477 	privcmd_nobjects--;
    478 }
    479 
/*
 * Fault handler for privcmd objects: resolve a fault by entering the
 * machine address recorded at mapping time (pobj->maddr[]) directly into
 * the faulting process' pmap, tagged with the owning domain id.
 *
 * Returns 0 on success, EFAULT for pages previously marked invalid,
 * ERESTART (after sleeping for memory) when pmap_enter_ma hit ENOMEM.
 */
static int
privpgop_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, struct vm_page **pps,
    int npages, int centeridx, vm_prot_t access_type, int flags)
{
	struct vm_map_entry *entry = ufi->entry;
	struct uvm_object *uobj = entry->object.uvm_obj;
	struct privcmd_object *pobj = (struct privcmd_object*)uobj;
	int maddr_i, i, error = 0;

	/* compute offset from start of map */
	maddr_i = (entry->offset + (vaddr - entry->start)) >> PAGE_SHIFT;
	if (maddr_i + npages > pobj->npages) {
		return EINVAL;
	}
	for (i = 0; i < npages; i++, maddr_i++, vaddr+= PAGE_SIZE) {
		/* unless PGO_ALLPAGES, only the faulting page is wanted */
		if ((flags & PGO_ALLPAGES) == 0 && i != centeridx)
			continue;
		if (pps[i] == PGO_DONTCARE)
			continue;
		if (pobj->maddr[maddr_i] == INVALID_PAGE) {
			/* This has already been flagged as error. */
			error = EFAULT;
			break;
		}
		error = pmap_enter_ma(ufi->orig_map->pmap, vaddr,
		    pobj->maddr[maddr_i], 0, ufi->entry->protection,
		    PMAP_CANFAIL | ufi->entry->protection,
		    pobj->domid);
		if (error == ENOMEM) {
			/* wait for memory below and have the fault retried */
			error = ERESTART;
			break;
		}
		if (error) {
			/* XXX for proper ptp accountings */
			pmap_remove(ufi->orig_map->pmap, vaddr,
			    vaddr + PAGE_SIZE);
		}
	}
	pmap_update(ufi->orig_map->pmap);
	uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj);

	if (error == ERESTART) {
		uvm_wait("privpgop_fault");
	}
	return error;
}
    526 
/*
 * Replace the user mappings covering [start, start + npages * PAGE_SIZE)
 * with a new privcmd object whose faults resolve to the given machine
 * addresses, owned by domain domid.
 *
 * Ownership of the maddr array is consumed: on failure it is freed here
 * (or via pgo_detach), on success it belongs to the new object.
 */
static int
privcmd_map_obj(struct vm_map *map, vaddr_t start, paddr_t *maddr,
		int npages, int domid)
{
	struct privcmd_object *obj;
	int error;
	uvm_flag_t uvmflag;
	vaddr_t newstart = start;
	vm_prot_t prot;
	off_t size = ((off_t)npages << PGSHIFT);

	vm_map_lock_read(map);
	/* get protections. This also check for validity of mapping */
	if (uvm_map_checkprot(map, start, start + size - 1, VM_PROT_WRITE))
		prot = VM_PROT_READ | VM_PROT_WRITE;
	else if (uvm_map_checkprot(map, start, start + size - 1, VM_PROT_READ))
		prot = VM_PROT_READ;
	else {
		printf("uvm_map_checkprot 0x%lx -> 0x%lx "
		    "failed\n",
		    start, (unsigned long)(start + size - 1));
		vm_map_unlock_read(map);
		kmem_free(maddr, sizeof(paddr_t) * npages);
		return EINVAL;
	}
	vm_map_unlock_read(map);
	/* remove current entries */
	uvm_unmap1(map, start, start + size, 0);

	/* NOTE(review): KM_SLEEP should never yield NULL; check kept as-is */
	obj = kmem_alloc(sizeof(struct privcmd_object), KM_SLEEP);
	if (obj == NULL) {
		kmem_free(maddr, sizeof(paddr_t) * npages);
		return ENOMEM;
	}

	privcmd_nobjects++;
	/* object starts with one reference, held by the mapping */
	uvm_obj_init(&obj->uobj, &privpgops, true, 1);
	mutex_enter(obj->uobj.vmobjlock);
	obj->maddr = maddr;
	obj->npages = npages;
	obj->domid = domid;
	mutex_exit(obj->uobj.vmobjlock);
	uvmflag = UVM_MAPFLAG(prot, prot, UVM_INH_NONE, UVM_ADV_NORMAL,
	    UVM_FLAG_FIXED | UVM_FLAG_NOMERGE);
	error = uvm_map(map, &newstart, size, &obj->uobj, 0, 0, uvmflag);

	if (error) {
		/* dropping our reference frees maddr and obj via pgo_detach */
		if (obj)
			obj->uobj.pgops->pgo_detach(&obj->uobj);
		return error;
	}
	if (newstart != start) {
		/*
		 * NOTE(review): the freshly created mapping/object is not
		 * released on this path — looks like a leak; confirm upstream.
		 */
		printf("uvm_map didn't give us back our vm space\n");
		return EINVAL;
	}
	return 0;
}
    584 
/* kernfs file operations: only ioctl is supported on /kern/xen/privcmd */
static const struct kernfs_fileop privcmd_fileops[] = {
  { .kf_fileop = KERNFS_FILEOP_IOCTL, .kf_vop = privcmd_ioctl },
};
    588 
/*
 * Register the /kern/xen/privcmd kernfs node.  Only privileged domains
 * (dom0 / toolstack-capable) get the node; unprivileged guests return
 * without registering anything.
 */
void
xenprivcmd_init(void)
{
	kernfs_entry_t *dkt;
	kfstype kfst;

	if (!xendomain_is_privileged())
		return;

	/* allocate a kernfs type backed by our ioctl-only fileops */
	kfst = KERNFS_ALLOCTYPE(privcmd_fileops);

	KERNFS_ALLOCENTRY(dkt, M_TEMP, M_WAITOK);
	/* regular file, mode PRIVCMD_MODE (owner read only) */
	KERNFS_INITENTRY(dkt, DT_REG, "privcmd", NULL, kfst, VREG,
	    PRIVCMD_MODE);
	kernfs_addentry(kernxen_pkt, dkt);
}
    605