linux_sysfs.c revision cad31331
1/*
2 * (C) Copyright IBM Corporation 2006
3 * All Rights Reserved.
4 * Copyright 2012 Red Hat, Inc.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * on the rights to use, copy, modify, merge, publish, distribute, sub
10 * license, and/or sell copies of the Software, and to permit persons to whom
11 * the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
20 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 */
25
26/**
27 * \file linux_sysfs.c
28 * Access PCI subsystem using Linux's sysfs interface.  This interface is
29 * available starting somewhere in the late 2.5.x kernel phase, and is the
30 * preferred method on all 2.6.x kernels.
31 *
32 * \author Ian Romanick <idr@us.ibm.com>
33 */
34
35#define _GNU_SOURCE
36
37#include <stdlib.h>
38#include <string.h>
39#include <stdio.h>
40#include <unistd.h>
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <fcntl.h>
44#include <sys/mman.h>
45#include <dirent.h>
46#include <errno.h>
47
/* Port I/O primitives.  On the architectures listed below they come from
 * <sys/io.h>; everywhere else they are replaced by the stubs in the #else
 * branch so that the I/O helpers later in this file still compile.
 */
#if defined(__i386__) || defined(__x86_64__) || defined(__arm__)
#include <sys/io.h>
#else
/* Stub reads evaluate to -1, stub writes expand to an empty statement, and
 * the iopl() stub always evaluates to -1 (callers treat non-zero as failure).
 */
#define inb(x) -1
#define inw(x) -1
#define inl(x) -1
#define outb(x,y) do {} while (0)
#define outw(x,y) do {} while (0)
#define outl(x,y) do {} while (0)
#define iopl(x) -1
#endif
59
60#include "config.h"
61
62#ifdef HAVE_MTRR
63#include <asm/mtrr.h>
64#include <sys/ioctl.h>
65#endif
66
67#include "pciaccess.h"
68#include "pciaccess_private.h"
69#include "linux_devmem.h"
70
/* Method table for this back end; defined at the end of the file. */
static const struct pci_system_methods linux_sysfs_methods;

/* sysfs directory that holds one entry per PCI device. */
#define SYS_BUS_PCI "/sys/bus/pci/devices"

/* Forward declaration: used by populate_entries and the probe routine before
 * the definition appears.
 */
static int
pci_device_linux_sysfs_read( struct pci_device * dev, void * data,
			     pciaddr_t offset, pciaddr_t size,
			     pciaddr_t * bytes_read );

/* Forward declaration: called from pci_system_linux_sysfs_create. */
static int populate_entries(struct pci_system * pci_sys);
81
82/**
83 * Attempt to access PCI subsystem using Linux's sysfs interface.
84 */
85_pci_hidden int
86pci_system_linux_sysfs_create( void )
87{
88    int err = 0;
89    struct stat st;
90
91
92    /* If the directory "/sys/bus/pci/devices" exists, then the PCI subsystem
93     * can be accessed using this interface.
94     */
95
96    if ( stat( SYS_BUS_PCI, & st ) == 0 ) {
97	pci_sys = calloc( 1, sizeof( struct pci_system ) );
98	if ( pci_sys != NULL ) {
99	    pci_sys->methods = & linux_sysfs_methods;
100#ifdef HAVE_MTRR
101	    pci_sys->mtrr_fd = open("/proc/mtrr", O_WRONLY | O_CLOEXEC);
102#endif
103	    err = populate_entries(pci_sys);
104	}
105	else {
106	    err = ENOMEM;
107	}
108    }
109    else {
110	err = errno;
111    }
112
113    return err;
114}
115
116
/**
 * Directory filter for \c scandir that drops the "." and ".." entries.
 *
 * \param d  Directory entry being processed by \c scandir.
 *
 * \return
 * Zero if the entry name is exactly "." or "..", one for any other name.
 *
 * \sa scandir, populate_entries
 */
static int
scan_sys_pci_filter( const struct dirent * d )
{
    const char * const entry = d->d_name;

    return (strcmp( entry, "." ) != 0) && (strcmp( entry, ".." ) != 0);
}
133
134
135int
136populate_entries( struct pci_system * p )
137{
138    struct dirent ** devices = NULL;
139    int n;
140    int i;
141    int err = 0;
142
143
144    n = scandir( SYS_BUS_PCI, & devices, scan_sys_pci_filter, alphasort );
145    if ( n > 0 ) {
146	p->num_devices = n;
147	p->devices = calloc( n, sizeof( struct pci_device_private ) );
148
149	if (p->devices != NULL) {
150	    for (i = 0 ; i < n ; i++) {
151		uint8_t config[48];
152		pciaddr_t bytes;
153		unsigned dom, bus, dev, func;
154		struct pci_device_private *device =
155			(struct pci_device_private *) &p->devices[i];
156
157
158		sscanf(devices[i]->d_name, "%04x:%02x:%02x.%1u",
159		       & dom, & bus, & dev, & func);
160
161		device->base.domain = dom;
162		device->base.bus = bus;
163		device->base.dev = dev;
164		device->base.func = func;
165
166
167		err = pci_device_linux_sysfs_read(& device->base, config, 0,
168						  48, & bytes);
169		if ((bytes == 48) && !err) {
170		    device->base.vendor_id = (uint16_t)config[0]
171			+ ((uint16_t)config[1] << 8);
172		    device->base.device_id = (uint16_t)config[2]
173			+ ((uint16_t)config[3] << 8);
174		    device->base.device_class = (uint32_t)config[9]
175			+ ((uint32_t)config[10] << 8)
176			+ ((uint32_t)config[11] << 16);
177		    device->base.revision = config[8];
178		    device->base.subvendor_id = (uint16_t)config[44]
179			+ ((uint16_t)config[45] << 8);
180		    device->base.subdevice_id = (uint16_t)config[46]
181			+ ((uint16_t)config[47] << 8);
182		}
183
184		if (err) {
185		    break;
186		}
187	    }
188	}
189	else {
190	    err = ENOMEM;
191	}
192    }
193
194    for (i = 0; i < n; i++)
195	free(devices[i]);
196    free(devices);
197
198    if (err) {
199	free(p->devices);
200	p->devices = NULL;
201    }
202
203    return err;
204}
205
206
207static int
208pci_device_linux_sysfs_probe( struct pci_device * dev )
209{
210    char     name[256];
211    uint8_t  config[256];
212    char     resource[512];
213    int fd;
214    pciaddr_t bytes;
215    unsigned i;
216    int err;
217
218
219    err = pci_device_linux_sysfs_read( dev, config, 0, 256, & bytes );
220    if ( bytes >= 64 ) {
221	struct pci_device_private *priv = (struct pci_device_private *) dev;
222
223	dev->irq = config[60];
224	priv->header_type = config[14];
225
226
227	/* The PCI config registers can be used to obtain information
228	 * about the memory and I/O regions for the device.  However,
229	 * doing so requires some tricky parsing (to correctly handle
230	 * 64-bit memory regions) and requires writing to the config
231	 * registers.  Since we'd like to avoid having to deal with the
232	 * parsing issues and non-root users can write to PCI config
233	 * registers, we use a different file in the device's sysfs
234	 * directory called "resource".
235	 *
236	 * The resource file contains all of the needed information in
237	 * a format that is consistent across all platforms.  Each BAR
238	 * and the expansion ROM have a single line of data containing
239	 * 3, 64-bit hex values:  the first address in the region,
240	 * the last address in the region, and the region's flags.
241	 */
242	snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/resource",
243		  SYS_BUS_PCI,
244		  dev->domain,
245		  dev->bus,
246		  dev->dev,
247		  dev->func );
248	fd = open( name, O_RDONLY | O_CLOEXEC);
249	if ( fd != -1 ) {
250	    char * next;
251	    pciaddr_t  low_addr;
252	    pciaddr_t  high_addr;
253	    pciaddr_t  flags;
254
255
256	    bytes = read( fd, resource, 512 );
257	    resource[511] = '\0';
258
259	    close( fd );
260
261	    next = resource;
262	    for ( i = 0 ; i < 6 ; i++ ) {
263
264		dev->regions[i].base_addr = strtoull( next, & next, 16 );
265		high_addr = strtoull( next, & next, 16 );
266		flags = strtoull( next, & next, 16 );
267
268		if ( dev->regions[i].base_addr != 0 ) {
269		    dev->regions[i].size = (high_addr
270					    - dev->regions[i].base_addr) + 1;
271
272		    dev->regions[i].is_IO = (flags & 0x01);
273		    dev->regions[i].is_64 = (flags & 0x04);
274		    dev->regions[i].is_prefetchable = (flags & 0x08);
275		}
276	    }
277
278	    low_addr = strtoull( next, & next, 16 );
279	    high_addr = strtoull( next, & next, 16 );
280	    flags = strtoull( next, & next, 16 );
281	    if ( low_addr != 0 ) {
282		priv->rom_base = low_addr;
283		dev->rom_size = (high_addr - low_addr) + 1;
284	    }
285	}
286    }
287
288    return err;
289}
290
291
292static int
293pci_device_linux_sysfs_read_rom( struct pci_device * dev, void * buffer )
294{
295    char name[256];
296    int fd;
297    struct stat  st;
298    int err = 0;
299    size_t rom_size;
300    size_t total_bytes;
301
302
303    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/rom",
304	      SYS_BUS_PCI,
305	      dev->domain,
306	      dev->bus,
307	      dev->dev,
308	      dev->func );
309
310    fd = open( name, O_RDWR | O_CLOEXEC);
311    if ( fd == -1 ) {
312#ifdef LINUX_ROM
313	/* If reading the ROM using sysfs fails, fall back to the old
314	 * /dev/mem based interface.
315	 * disable this for newer kernels using configure
316	 */
317	return pci_device_linux_devmem_read_rom(dev, buffer);
318#else
319	return errno;
320#endif
321    }
322
323
324    if ( fstat( fd, & st ) == -1 ) {
325	close( fd );
326	return errno;
327    }
328
329    rom_size = st.st_size;
330    if ( rom_size == 0 )
331	rom_size = 0x10000;
332
333    /* This is a quirky thing on Linux.  Even though the ROM and the file
334     * for the ROM in sysfs are read-only, the string "1" must be written to
335     * the file to enable the ROM.  After the data has been read, "0" must be
336     * written to the file to disable the ROM.
337     */
338    write( fd, "1", 1 );
339    lseek( fd, 0, SEEK_SET );
340
341    for ( total_bytes = 0 ; total_bytes < rom_size ; /* empty */ ) {
342	const int bytes = read( fd, (char *) buffer + total_bytes,
343				rom_size - total_bytes );
344	if ( bytes == -1 ) {
345	    err = errno;
346	    break;
347	}
348	else if ( bytes == 0 ) {
349	    break;
350	}
351
352	total_bytes += bytes;
353    }
354
355
356    lseek( fd, 0, SEEK_SET );
357    write( fd, "0", 1 );
358
359    close( fd );
360    return err;
361}
362
363
364static int
365pci_device_linux_sysfs_read( struct pci_device * dev, void * data,
366			     pciaddr_t offset, pciaddr_t size,
367			     pciaddr_t * bytes_read )
368{
369    char name[256];
370    pciaddr_t temp_size = size;
371    int err = 0;
372    int fd;
373    char *data_bytes = data;
374
375    if ( bytes_read != NULL ) {
376	*bytes_read = 0;
377    }
378
379    /* Each device has a directory under sysfs.  Within that directory there
380     * is a file named "config".  This file used to access the PCI config
381     * space.  It is used here to obtain most of the information about the
382     * device.
383     */
384    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/config",
385	      SYS_BUS_PCI,
386	      dev->domain,
387	      dev->bus,
388	      dev->dev,
389	      dev->func );
390
391    fd = open( name, O_RDONLY | O_CLOEXEC);
392    if ( fd == -1 ) {
393	return errno;
394    }
395
396
397    while ( temp_size > 0 ) {
398	const ssize_t bytes = pread64( fd, data_bytes, temp_size, offset );
399
400	/* If zero bytes were read, then we assume it's the end of the
401	 * config file.
402	 */
403	if (bytes == 0)
404	    break;
405	if ( bytes < 0 ) {
406	    err = errno;
407	    break;
408	}
409
410	temp_size -= bytes;
411	offset += bytes;
412	data_bytes += bytes;
413    }
414
415    if ( bytes_read != NULL ) {
416	*bytes_read = size - temp_size;
417    }
418
419    close( fd );
420    return err;
421}
422
423
424static int
425pci_device_linux_sysfs_write( struct pci_device * dev, const void * data,
426			     pciaddr_t offset, pciaddr_t size,
427			     pciaddr_t * bytes_written )
428{
429    char name[256];
430    pciaddr_t temp_size = size;
431    int err = 0;
432    int fd;
433    const char *data_bytes = data;
434
435    if ( bytes_written != NULL ) {
436	*bytes_written = 0;
437    }
438
439    /* Each device has a directory under sysfs.  Within that directory there
440     * is a file named "config".  This file used to access the PCI config
441     * space.  It is used here to obtain most of the information about the
442     * device.
443     */
444    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/config",
445	      SYS_BUS_PCI,
446	      dev->domain,
447	      dev->bus,
448	      dev->dev,
449	      dev->func );
450
451    fd = open( name, O_WRONLY | O_CLOEXEC);
452    if ( fd == -1 ) {
453	return errno;
454    }
455
456
457    while ( temp_size > 0 ) {
458	const ssize_t bytes = pwrite64( fd, data_bytes, temp_size, offset );
459
460	/* If zero bytes were written, then we assume it's the end of the
461	 * config file.
462	 */
463	if ( bytes == 0 )
464	    break;
465	if ( bytes < 0 ) {
466	    err = errno;
467	    break;
468	}
469
470	temp_size -= bytes;
471	offset += bytes;
472	data_bytes += bytes;
473    }
474
475    if ( bytes_written != NULL ) {
476	*bytes_written = size - temp_size;
477    }
478
479    close( fd );
480    return err;
481}
482
483static int
484pci_device_linux_sysfs_map_range_wc(struct pci_device *dev,
485				    struct pci_device_mapping *map)
486{
487    char name[256];
488    int fd;
489    const int prot = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
490        ? (PROT_READ | PROT_WRITE) : PROT_READ;
491    const int open_flags = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
492        ? O_RDWR : O_RDONLY;
493    const off_t offset = map->base - dev->regions[map->region].base_addr;
494
495    snprintf(name, 255, "%s/%04x:%02x:%02x.%1u/resource%u_wc",
496	     SYS_BUS_PCI,
497	     dev->domain,
498	     dev->bus,
499	     dev->dev,
500	     dev->func,
501	     map->region);
502    fd = open(name, open_flags | O_CLOEXEC);
503    if (fd == -1)
504	    return errno;
505
506    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
507    if (map->memory == MAP_FAILED) {
508        map->memory = NULL;
509	close(fd);
510	return errno;
511    }
512
513    close(fd);
514
515    return 0;
516}
517
518/**
519 * Map a memory region for a device using the Linux sysfs interface.
520 *
521 * \param dev   Device whose memory region is to be mapped.
522 * \param map   Parameters of the mapping that is to be created.
523 *
524 * \return
525 * Zero on success or an \c errno value on failure.
526 *
527 * \sa pci_device_map_rrange, pci_device_linux_sysfs_unmap_range
528 *
529 * \todo
530 * Some older 2.6.x kernels don't implement the resourceN files.  On those
531 * systems /dev/mem must be used.  On these systems it is also possible that
532 * \c mmap64 may need to be used.
533 */
534static int
535pci_device_linux_sysfs_map_range(struct pci_device *dev,
536                                 struct pci_device_mapping *map)
537{
538    char name[256];
539    int fd;
540    int err = 0;
541    const int prot = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
542        ? (PROT_READ | PROT_WRITE) : PROT_READ;
543    const int open_flags = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
544        ? O_RDWR : O_RDONLY;
545    const off_t offset = map->base - dev->regions[map->region].base_addr;
546#ifdef HAVE_MTRR
547    struct mtrr_sentry sentry = {
548	.base = map->base,
549        .size = map->size,
550	.type = MTRR_TYPE_UNCACHABLE
551    };
552#endif
553
554    /* For WC mappings, try sysfs resourceN_wc file first */
555    if ((map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) &&
556	!pci_device_linux_sysfs_map_range_wc(dev, map))
557	    return 0;
558
559    snprintf(name, 255, "%s/%04x:%02x:%02x.%1u/resource%u",
560             SYS_BUS_PCI,
561             dev->domain,
562             dev->bus,
563             dev->dev,
564             dev->func,
565             map->region);
566
567    fd = open(name, open_flags | O_CLOEXEC);
568    if (fd == -1) {
569        return errno;
570    }
571
572
573    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
574    if (map->memory == MAP_FAILED) {
575        map->memory = NULL;
576	close(fd);
577	return errno;
578    }
579
580#ifdef HAVE_MTRR
581    if ((map->flags & PCI_DEV_MAP_FLAG_CACHABLE) != 0) {
582        sentry.type = MTRR_TYPE_WRBACK;
583    } else if ((map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) != 0) {
584        sentry.type = MTRR_TYPE_WRCOMB;
585    }
586
587    if (pci_sys->mtrr_fd != -1 && sentry.type != MTRR_TYPE_UNCACHABLE) {
588	if (ioctl(pci_sys->mtrr_fd, MTRRIOC_ADD_ENTRY, &sentry) < 0) {
589	    /* FIXME: Should we report an error in this case?
590	     */
591	    fprintf(stderr, "error setting MTRR "
592		    "(base = 0x%08lx, size = 0x%08x, type = %u) %s (%d)\n",
593		    sentry.base, sentry.size, sentry.type,
594		    strerror(errno), errno);
595/*            err = errno;*/
596	}
597	/* KLUDGE ALERT -- rewrite the PTEs to turn off the CD and WT bits */
598	mprotect (map->memory, map->size, PROT_NONE);
599	err = mprotect (map->memory, map->size, PROT_READ|PROT_WRITE);
600
601	if (err != 0) {
602	    fprintf(stderr, "mprotect(PROT_READ | PROT_WRITE) failed: %s\n",
603		    strerror(errno));
604	    fprintf(stderr, "remapping without mprotect performance kludge.\n");
605
606	    munmap(map->memory, map->size);
607	    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
608	    if (map->memory == MAP_FAILED) {
609		map->memory = NULL;
610		close(fd);
611		return errno;
612	    }
613	}
614    }
615#endif
616
617    close(fd);
618
619    return 0;
620}
621
/**
 * Unmap a memory region for a device using the Linux sysfs interface.
 *
 * The mapping is released with \c pci_device_generic_unmap_range.  When
 * built with \c HAVE_MTRR, the MTRR entry corresponding to the mapping's
 * caching flags is then deleted; a failure to do so is only logged to
 * \c stderr.
 *
 * \param dev   Device whose memory region is to be unmapped.
 * \param map   Parameters of the mapping that is to be destroyed.
 *
 * \return
 * Zero on success or the error returned by the generic unmap routine.
 *
 * \sa pci_device_map_range, pci_device_linux_sysfs_map_range
 */
static int
pci_device_linux_sysfs_unmap_range(struct pci_device *dev,
                                   struct pci_device_mapping *map)
{
#ifdef HAVE_MTRR
    /* Captured before the unmap call, while the mapping is still intact. */
    struct mtrr_sentry sentry = {
        .base = map->base,
        .size = map->size,
        .type = MTRR_TYPE_UNCACHABLE
    };
#endif
    const int err = pci_device_generic_unmap_range (dev, map);

    if (err != 0) {
        return err;
    }

#ifdef HAVE_MTRR
    if (map->flags & PCI_DEV_MAP_FLAG_CACHABLE) {
        sentry.type = MTRR_TYPE_WRBACK;
    } else if (map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) {
        sentry.type = MTRR_TYPE_WRCOMB;
    }

    /* Only mappings that asked for a non-default caching mode had an MTRR
     * entry added for them.
     */
    if (pci_sys->mtrr_fd != -1
        && sentry.type != MTRR_TYPE_UNCACHABLE
        && ioctl(pci_sys->mtrr_fd, MTRRIOC_DEL_ENTRY, &sentry) < 0) {
        /* FIXME: Should we report an error in this case?
         */
        fprintf(stderr, "error setting MTRR "
                "(base = 0x%08lx, size = 0x%08x, type = %u) %s (%d)\n",
                sentry.base, sentry.size, sentry.type,
                strerror(errno), errno);
    }
#endif

    return 0;
}
677
678static void pci_device_linux_sysfs_enable(struct pci_device *dev)
679{
680    char name[256];
681    int fd;
682
683    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/enable",
684	      SYS_BUS_PCI,
685	      dev->domain,
686	      dev->bus,
687	      dev->dev,
688	      dev->func );
689
690    fd = open( name, O_RDWR | O_CLOEXEC);
691    if (fd == -1)
692       return;
693
694    write( fd, "1", 1 );
695    close(fd);
696}
697
698static int pci_device_linux_sysfs_boot_vga(struct pci_device *dev)
699{
700    char name[256];
701    char reply[3];
702    int fd, bytes_read;
703    int ret = 0;
704
705    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/boot_vga",
706	      SYS_BUS_PCI,
707	      dev->domain,
708	      dev->bus,
709	      dev->dev,
710	      dev->func );
711
712    fd = open( name, O_RDONLY | O_CLOEXEC);
713    if (fd == -1)
714       return 0;
715
716    bytes_read = read(fd, reply, 1);
717    if (bytes_read != 1)
718	goto out;
719    if (reply[0] == '1')
720	ret = 1;
721out:
722    close(fd);
723    return ret;
724}
725
726static int pci_device_linux_sysfs_has_kernel_driver(struct pci_device *dev)
727{
728    char name[256];
729    struct stat dummy;
730    int ret;
731
732    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/driver",
733	      SYS_BUS_PCI,
734	      dev->domain,
735	      dev->bus,
736	      dev->dev,
737	      dev->func );
738
739    ret = stat(name, &dummy);
740    if (ret < 0)
741	return 0;
742    return 1;
743}
744
745static struct pci_io_handle *
746pci_device_linux_sysfs_open_device_io(struct pci_io_handle *ret,
747				      struct pci_device *dev, int bar,
748				      pciaddr_t base, pciaddr_t size)
749{
750    char name[PATH_MAX];
751
752    snprintf(name, PATH_MAX, "%s/%04x:%02x:%02x.%1u/resource%d",
753	     SYS_BUS_PCI, dev->domain, dev->bus, dev->dev, dev->func, bar);
754
755    ret->fd = open(name, O_RDWR | O_CLOEXEC);
756
757    if (ret->fd < 0)
758	return NULL;
759
760    ret->base = base;
761    ret->size = size;
762
763    return ret;
764}
765
766static struct pci_io_handle *
767pci_device_linux_sysfs_open_legacy_io(struct pci_io_handle *ret,
768				      struct pci_device *dev, pciaddr_t base,
769				      pciaddr_t size)
770{
771    char name[PATH_MAX];
772
773    /* First check if there's a legacy io method for the device */
774    while (dev) {
775	snprintf(name, PATH_MAX, "/sys/class/pci_bus/%04x:%02x/legacy_io",
776		 dev->domain, dev->bus);
777
778	ret->fd = open(name, O_RDWR | O_CLOEXEC);
779	if (ret->fd >= 0)
780	    break;
781
782	dev = pci_device_get_parent_bridge(dev);
783    }
784
785    /*
786     * You would think you'd want to use /dev/port here.  Don't make that
787     * mistake, /dev/port only does byte-wide i/o cycles which means it
788     * doesn't work.  If you think this is stupid, well, you're right.
789     */
790
791    /* If we've no other choice, iopl */
792    if (ret->fd < 0) {
793	if (iopl(3))
794	    return NULL;
795    }
796
797    ret->base = base;
798    ret->size = size;
799
800    return ret;
801}
802
803static void
804pci_device_linux_sysfs_close_io(struct pci_device *dev,
805				struct pci_io_handle *handle)
806{
807    if (handle->fd > -1)
808	close(handle->fd);
809}
810
811static uint32_t
812pci_device_linux_sysfs_read32(struct pci_io_handle *handle, uint32_t port)
813{
814    uint32_t ret;
815
816    if (handle->fd > -1)
817	pread(handle->fd, &ret, 4, port + handle->base);
818    else
819	ret = inl(port + handle->base);
820
821    return ret;
822}
823
824static uint16_t
825pci_device_linux_sysfs_read16(struct pci_io_handle *handle, uint32_t port)
826{
827    uint16_t ret;
828
829    if (handle->fd > -1)
830	pread(handle->fd, &ret, 2, port + handle->base);
831    else
832	ret = inw(port + handle->base);
833
834    return ret;
835}
836
837static uint8_t
838pci_device_linux_sysfs_read8(struct pci_io_handle *handle, uint32_t port)
839{
840    uint8_t ret;
841
842    if (handle->fd > -1)
843	pread(handle->fd, &ret, 1, port + handle->base);
844    else
845	ret = inb(port + handle->base);
846
847    return ret;
848}
849
850static void
851pci_device_linux_sysfs_write32(struct pci_io_handle *handle, uint32_t port,
852			       uint32_t data)
853{
854    if (handle->fd > -1)
855	pwrite(handle->fd, &data, 4, port + handle->base);
856    else
857	outl(data, port + handle->base);
858}
859
860static void
861pci_device_linux_sysfs_write16(struct pci_io_handle *handle, uint32_t port,
862			       uint16_t data)
863{
864    if (handle->fd > -1)
865	pwrite(handle->fd, &data, 2, port + handle->base);
866    else
867	outw(data, port + handle->base);
868}
869
870static void
871pci_device_linux_sysfs_write8(struct pci_io_handle *handle, uint32_t port,
872			      uint8_t data)
873{
874    if (handle->fd > -1)
875	pwrite(handle->fd, &data, 1, port + handle->base);
876    else
877	outb(data, port + handle->base);
878}
879
880static int
881pci_device_linux_sysfs_map_legacy(struct pci_device *dev, pciaddr_t base,
882				  pciaddr_t size, unsigned map_flags, void **addr)
883{
884    char name[PATH_MAX];
885    int flags = O_RDONLY;
886    int prot = PROT_READ;
887    int fd;
888    int ret=0;
889
890    if (map_flags & PCI_DEV_MAP_FLAG_WRITABLE) {
891	flags = O_RDWR; /* O_RDWR != O_WRONLY | O_RDONLY */;
892	prot |= PROT_WRITE;
893    }
894
895    /* First check if there's a legacy memory method for the device */
896    while (dev) {
897	snprintf(name, PATH_MAX, "/sys/class/pci_bus/%04x:%02x/legacy_mem",
898		 dev->domain, dev->bus);
899
900	fd = open(name, flags | O_CLOEXEC);
901	if (fd >= 0)
902	    break;
903
904	dev = pci_device_get_parent_bridge(dev);
905    }
906
907    /* If not, /dev/mem is the best we can do */
908    if (!dev)
909	fd = open("/dev/mem", flags | O_CLOEXEC);
910
911    if (fd < 0)
912	return errno;
913
914    *addr = mmap(NULL, size, prot, MAP_SHARED, fd, base);
915    if (*addr == MAP_FAILED) {
916	ret = errno;
917    }
918
919    close(fd);
920    return ret;
921}
922
923static int
924pci_device_linux_sysfs_unmap_legacy(struct pci_device *dev, void *addr, pciaddr_t size)
925{
926    return munmap(addr, size);
927}
928
929
/**
 * Release the resources this back end holds in \c pci_sys.
 *
 * When built with \c HAVE_MTRR this closes the "/proc/mtrr" descriptor, if
 * one was opened; otherwise it does nothing.
 */
static void
pci_system_linux_destroy(void)
{
#ifdef HAVE_MTRR
    const int mtrr_fd = pci_sys->mtrr_fd;

    if (mtrr_fd != -1) {
        close(mtrr_fd);
    }
#endif
}
938
/**
 * Method table that binds the generic PCI access entry points to the sysfs
 * implementations in this file.  It is installed into \c pci_sys by
 * pci_system_linux_sysfs_create.
 */
static const struct pci_system_methods linux_sysfs_methods = {
    /* System and device lifetime; no per-device destructor is provided. */
    .destroy = pci_system_linux_destroy,
    .destroy_device = NULL,
    .read_rom = pci_device_linux_sysfs_read_rom,
    .probe = pci_device_linux_sysfs_probe,
    .map_range = pci_device_linux_sysfs_map_range,
    .unmap_range = pci_device_linux_sysfs_unmap_range,

    /* Config space access. */
    .read = pci_device_linux_sysfs_read,
    .write = pci_device_linux_sysfs_write,

    /* Capability parsing uses the generic helper, not a sysfs-specific one. */
    .fill_capabilities = pci_fill_capabilities_generic,
    .enable = pci_device_linux_sysfs_enable,
    .boot_vga = pci_device_linux_sysfs_boot_vga,
    .has_kernel_driver = pci_device_linux_sysfs_has_kernel_driver,

    /* I/O port access. */
    .open_device_io = pci_device_linux_sysfs_open_device_io,
    .open_legacy_io = pci_device_linux_sysfs_open_legacy_io,
    .close_io = pci_device_linux_sysfs_close_io,
    .read32 = pci_device_linux_sysfs_read32,
    .read16 = pci_device_linux_sysfs_read16,
    .read8 = pci_device_linux_sysfs_read8,
    .write32 = pci_device_linux_sysfs_write32,
    .write16 = pci_device_linux_sysfs_write16,
    .write8 = pci_device_linux_sysfs_write8,

    /* Legacy memory mapping. */
    .map_legacy = pci_device_linux_sysfs_map_legacy,
    .unmap_legacy = pci_device_linux_sysfs_unmap_legacy,
};
968